diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS index 8541e954b9..71519d1ad8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/AUTHORS @@ -10,6 +10,7 @@ generates includes the following authors: Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) @@ -28,5 +29,4 @@ acknowledged collaboration with the following collaborators: Taran Singhania (PES University Bangalore) David Smith (CERN) Carl Vuosalo (University of Wisconsin-Madison) - Joergen Teig (CERN) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc index 3c231bdbd6..54ce4c64cf 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -15,7 +15,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc index 0250c160ed..94b8dd6444 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/cpp_model_parameters_h.inc @@ -25,7 +25,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +85,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -155,7 +155,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -172,7 +172,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -196,7 +196,7 @@ namespace mg5amcCpu %(eftspecial2)s return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
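
The recurring change throughout these templates is the switch from "#ifdef __CUDACC__" to "#ifdef MGONGPUCPP_GPUIMPL", so that the GPU code path is selected for HIP builds as well as for CUDA builds. The macro itself is not defined anywhere in this patch; presumably it is set in a central configuration header whenever a translation unit is compiled for a GPU backend. A minimal sketch of that selection logic, under that assumption only:

// Sketch only: the actual definition lives outside this patch (e.g. in a common config header).
// MGONGPUCPP_GPUIMPL means "building for a GPU backend", true for both nvcc and hipcc builds.
#if defined( __CUDACC__ ) || defined( __HIPCC__ )
#define MGONGPUCPP_GPUIMPL 1
#endif

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // CPU build (C++/SIMD)
#endif
{
  // ... generated process code ...
}
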
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
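
The Bridge.h change above replaces std::filesystem::exists with a stat()-based lambda to avoid the std::filesystem portability issues on LUMI (#803). A self-contained sketch of the same check; the helper name and the fallback path are illustrative only, not part of the patch:

#include <sys/stat.h> // POSIX stat(), used to bypass std::filesystem (#803)
#include <string>

// Minimal equivalent of the lambda above: true if the file exists on disk.
inline bool fileExists( const std::string& fileName )
{
  struct stat buffer;
  return stat( fileName.c_str(), &buffer ) == 0;
}

// Illustrative usage: prefer the local param card, else look one directory up.
inline std::string findParamCard()
{
  std::string paramCard = "../../Cards/param_card.dat";
  if( !fileExists( paramCard ) ) paramCard = "../" + paramCard;
  return paramCard;
}
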
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
 #ifndef EventStatistics_H
 #define EventStatistics_H 1
@@ -16,7 +16,7 @@
 #include
 #include
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 #else
 namespace mg5amcCpu
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h
new file mode 100644
index 0000000000..9c467b1e04
--- /dev/null
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuAbstraction.h
@@ -0,0 +1,69 @@
+// Copyright (C) 2020-2023 CERN and UCLouvain.
+// Licensed under the GNU Lesser General Public License (version 3 or later).
+// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+
+#ifndef MG5AMC_GPUABSTRACTION_H
+#define MG5AMC_GPUABSTRACTION_H 1
+
+#include
+
+//--------------------------------------------------------------------------
+
+#ifdef __CUDACC__
+
+#define gpuError_t cudaError_t
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorString cudaGetErrorString
+#define gpuSuccess cudaSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
+#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
+
+#define gpuSetDevice cudaSetDevice
+#define gpuDeviceSynchronize cudaDeviceSynchronize
+#define gpuDeviceReset cudaDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#elif defined __HIPCC__
+
+#define gpuError_t hipError_t
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorString hipGetErrorString
+#define gpuSuccess hipSuccess
+
+#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better
+#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
+
+#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) )
+#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
+#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
+#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) )
+
+#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
+#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
+
+#define gpuSetDevice hipSetDevice
+#define gpuDeviceSynchronize hipDeviceSynchronize
+#define gpuDeviceReset hipDeviceReset
+
+#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )
+#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )
+
+//--------------------------------------------------------------------------
+
+#endif
+
+#endif // MG5AMC_GPUABSTRACTION_H
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
similarity index 62%
rename from epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h
rename to epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
index 64ce52f4b3..93579ef08b 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/CudaRuntime.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/GpuRuntime.h
@@ -1,49 +1,50 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
-// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin.
+// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
-#ifndef MG5AMC_CUDARUNTIME_H
-#define MG5AMC_CUDARUNTIME_H 1
+#ifndef MG5AMC_GPURUNTIME_H
+#define MG5AMC_GPURUNTIME_H 1
 
 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API
 // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api
 
-#include
+#include "GpuAbstraction.h"
+
 #include
 
 //--------------------------------------------------------------------------
 
 // See https://stackoverflow.com/a/14038590
-#ifdef __CUDACC__ /* clang-format off */
-#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); }
-inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true )
+#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */
+#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
+inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
 {
-  if( code != cudaSuccess )
+  if( code != gpuSuccess )
   {
-    printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
-    if( abort ) assert( code == cudaSuccess );
+    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
+    if( abort ) assert( code == gpuSuccess );
   }
 }
 #endif /* clang-format on */
 
 //--------------------------------------------------------------------------
 
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 namespace mg5amcGpu
 {
-  // Instantiate a CudaRuntime at the beginnining of the application's main to
-  // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor
+  // Instantiate a GpuRuntime at the beginning of the application's main to
+  // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor
   // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! ***
-  struct CudaRuntime final
+  struct GpuRuntime final
   {
-    CudaRuntime( const bool debug = true )
+    GpuRuntime( const bool debug = true )
       : m_debug( debug ) { setUp( m_debug ); }
-    ~CudaRuntime() { tearDown( m_debug ); }
-    CudaRuntime( const CudaRuntime& ) = delete;
-    CudaRuntime( CudaRuntime&& ) = delete;
-    CudaRuntime& operator=( const CudaRuntime& ) = delete;
-    CudaRuntime& operator=( CudaRuntime&& ) = delete;
+    ~GpuRuntime() { tearDown( m_debug ); }
+    GpuRuntime( const GpuRuntime& ) = delete;
+    GpuRuntime( GpuRuntime&& ) = delete;
+    GpuRuntime& operator=( const GpuRuntime& ) = delete;
+    GpuRuntime& operator=( GpuRuntime&& ) = delete;
     bool m_debug;
 
     // Set up CUDA application
@@ -62,8 +63,8 @@ namespace mg5amcGpu
 */
       // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either
      // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs)
-      if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl;
-      checkCuda( cudaSetDevice( 0 ) ); // SLOW!
+      if( debug ) std::cout << "__GpuRuntime: calling gpuSetDevice(0)" << std::endl;
+      checkGpu( gpuSetDevice( 0 ) ); // SLOW!
     }
 
     // Tear down CUDA application (call cudaDeviceReset)
@@ -72,14 +73,13 @@ namespace mg5amcGpu
    // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking
    static void tearDown( const bool debug = true )
    {
-      if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl;
-      checkCuda( cudaDeviceReset() );
+      if( debug ) std::cout << "__GpuRuntime: calling gpuDeviceReset()" << std::endl;
+      checkGpu( gpuDeviceReset() );
    }
  };
-
 }
 #endif
 
 //--------------------------------------------------------------------------
 
-#endif // MG5AMC_CUDARUNTIME_H
+#endif // MG5AMC_GPURUNTIME_H
diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h
index ef40624c88..6054185300 100644
--- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h
+++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MadgraphTest.h
@@ -1,7 +1,7 @@
 // Copyright (C) 2020-2023 CERN and UCLouvain.
 // Licensed under the GNU Lesser General Public License (version 3 or later).
 // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin.
-// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
+// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
 
 #ifndef MADGRAPHTEST_H_
 #define MADGRAPHTEST_H_ 1
@@ -14,7 +14,11 @@
 #include
 #include
-#include
+//#ifdef __HIPCC__
+//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79
+//#else
+//#include // bypass this completely to ease portability on LUMI #803
+//#endif
 #include
 #include
 #include
@@ -22,7 +26,7 @@
 #include
 #include
-#ifdef __CUDACC__
+#ifdef MGONGPUCPP_GPUIMPL
 using mg5amcGpu::CPPProcess;
 #else
 using mg5amcCpu::CPPProcess;
@@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam
 // Since we link both the CPU-only and GPU tests into the same executable, we prevent
 // a multiply defined symbol by only compiling this in the non-CUDA phase:
-#ifndef __CUDACC__
+#ifndef MGONGPUCPP_GPUIMPL
 
 /// Compare momenta and matrix elements.
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
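
In MatrixElementKernels.cc above, every explicit kernel<<<blocks, threads>>> launch is rewritten through gpuLaunchKernel / gpuLaunchKernelSharedMem so that the same call site compiles under both nvcc and hipcc. A hedged sketch of such a call site, using a hypothetical kernel that is not part of this patch:

#include "GpuRuntime.h" // brings in GpuAbstraction.h plus checkGpu/assertGpu

// Hypothetical kernel, for illustration only: rescale one matrix element per GPU thread.
__global__ void scaleMEs( double* mes, double factor, int nevt )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  if( ievt < nevt ) mes[ievt] *= factor;
}

void scaleOnDevice( double* devMEs, double factor, int nevt, int gpublocks, int gputhreads )
{
  // Expands to scaleMEs<<<gpublocks, gputhreads>>>( devMEs, factor, nevt ) under CUDA,
  // and to the equivalent hipcc launch under HIP.
  gpuLaunchKernel( scaleMEs, gpublocks, gputhreads, devMEs, factor, nevt );
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
}
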
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h index 1afc589b11..b4b76f3842 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- 
a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. 
Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h index 48306a9d41..f29b8c5357 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_%(model_name)s.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs 
constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! @@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; 
#else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
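The MemoryBuffers.h hunks above replace checkCuda( cudaMemcpy( ... ) ) with a vendor-neutral gpuMemcpy( ..., gpuMemcpyHostToDevice / gpuMemcpyDeviceToHost ). The plugin's actual GPU abstraction header is not part of this excerpt; the snippet below is only a minimal sketch, assuming such a header maps the gpu* names onto the CUDA or HIP runtime via preprocessor macros.

#ifdef __CUDACC__ // CUDA build: map the assumed gpu* names onto the CUDA runtime API
#include <cuda_runtime.h>
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) cudaMemcpy( dst, src, bytes, kind ) // the real wrapper presumably also checks the returned status, as checkCuda did
#elif defined __HIPCC__ // HIP build: map the same names onto the HIP runtime API
#include <hip/hip_runtime.h>
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) hipMemcpy( dst, src, bytes, kind ) // same caveat on status checking
#endif

With a mapping of this kind, the copyDeviceFromHost and copyHostFromDevice helpers above compile unchanged under both nvcc and hipcc.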
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
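In the RamboSamplingKernels.cc hunk above, the CUDA-only triple-chevron launches of getMomentaInitialDevice and getMomentaFinalDevice become gpuLaunchKernel( kernel, blocks, threads, args... ) calls. The wrapper's definition is not shown in this diff; a minimal sketch of one possible implementation (assumed macro, simplified to zero dynamic shared memory and the default stream) is:

#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ ) // CUDA triple-chevron launch
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ ) // HIP launch: 0 bytes of shared memory, default stream
#endif

A call such as gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ) then expands to the appropriate vendor syntax.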
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
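The RandomNumberKernels.h hunk above keeps the note that mgOnGpuConfig.h must be included first because it provides the definition of __global__ when MGONGPUCPP_GPUIMPL is not defined. That definition lies outside this excerpt; presumably it is a no-op along the following lines (sketch, assumption only), so that kernel declarations compile as ordinary host functions in CPU-only builds.

#ifndef MGONGPUCPP_GPUIMPL // CPU-only build: neither nvcc nor hipcc is compiling this translation unit
#ifndef __global__
#define __global__ // empty: a '__global__ void kernel( ... )' declaration becomes a plain C++ function
#endif
#endif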
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginning of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
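The "00 GpuInit" step above instantiates a GpuRuntime object at the beginning of main, and fbridge.cc further down calls GpuRuntime::setUp() and GpuRuntime::tearDown() directly. The class itself is not included in this excerpt; a minimal RAII sketch consistent with those comments (CUDA branch only, assumed shape and a stand-in name) would be:

#include <cuda_runtime.h>
struct GpuRuntimeSketch // stand-in name, not the plugin's real GpuRuntime
{
  GpuRuntimeSketch( bool debug = false ) : m_debug( debug ) { setUp( m_debug ); } // set the device up when main starts
  ~GpuRuntimeSketch() { tearDown( m_debug ); } // reset the device when main returns
  static void setUp( bool /*debug*/ = false ) { cudaSetDevice( 0 ); } // as described in the "00 GpuInit" comment (CUDA case)
  static void tearDown( bool /*debug*/ = false ) { cudaDeviceReset(); } // a HIP build would use the corresponding hip* calls instead
private:
  bool m_debug;
};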
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc index 562af241af..b9840f1374 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cpp_hel_amps_h.inc @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for %(output_name)s by %(info_lines)s @@ -26,7 +26,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk index b399eb36b0..c570d2418c 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %%bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %%/bin/nvcc,%%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %%/bin/hipcc,%%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) 
+export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += 
-DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%%.o : %%.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%%_cu.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%%.o : %%.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring 
hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 
endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %%bin/nvc++,%%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) 
$(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk index 25b6f8f7c8..49ccf0c4e3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%%.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%%_cu.o : %%.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_%(model)s_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the heterogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file.
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h index 1175622ff4..05013cf981 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. 
Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) %(mgongpu_supports_multichannel)s +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
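For orientation, the macro discipline introduced in the mgOnGpuConfig.h hunks above can be summarised with a small standalone sketch. This is illustrative only and not part of the patch; the function name dummySquare is a placeholder, while mgOnGpuConfig.h, fptype, MGONGPUCPP_GPUIMPL and the mg5amcGpu/mg5amcCpu namespaces are taken from the hunks themselves. Portable GPU-or-CPU code branches on MGONGPUCPP_GPUIMPL, CUDA-only or HIP-only details keep branching on __CUDACC__ or __HIPCC__, and the empty __host__/__device__ definitions let the same signatures compile in plain C++.

#include "mgOnGpuConfig.h" // assumed: defines MGONGPUCPP_GPUIMPL for CUDA/HIP builds and empty __host__/__device__ for C++

#ifdef MGONGPUCPP_GPUIMPL // any GPU build (CUDA or HIP)
namespace mg5amcGpu
#else // CPU build
namespace mg5amcCpu
#endif
{
  // Device-capable on GPU builds, plain inline function on CPU builds (empty specifiers)
  __host__ __device__ inline fptype dummySquare( const fptype x ) { return x * x; }

#ifdef __CUDACC__
  // CUDA-only details (e.g. thrust/curand/nsight hooks) would still be guarded by __CUDACC__
#elif defined __HIPCC__
  // HIP-only details (e.g. hip_runtime specifics, cxsmpl-only complex type) would be guarded by __HIPCC__
#endif
}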
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else 
// i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc index 778e210468..815fd8d5b7 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_cc.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. 
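The mgOnGpuVectors.h hunks above preserve the scalar-or-vector pattern across backends: on GPU builds fptype_sv is a plain scalar (one event per thread), while on SIMD-enabled C++ builds it is a compiler vector holding neppV events. A minimal illustrative sketch of code written once against fptype_sv follows; the helper name sumEnergies_sv is hypothetical and not taken from the patch.

#include "mgOnGpuVectors.h" // assumed: provides fptype_sv and neppV as in the hunks above

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // The same source line performs scalar arithmetic on one event (CUDA/HIP)
  // or vector arithmetic on a page of neppV events (SIMD C++)
  inline __host__ __device__ fptype_sv sumEnergies_sv( const fptype_sv& e1, const fptype_sv& e2 )
  {
    return e1 + e2;
  }
}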
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -14,7 +14,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" %(hel_amps_h)s #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc index 5f1ea36b9e..9d024183db 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_function_definitions.inc @@ -4,13 +4,13 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== //========================================================================== // Class member functions for calculating the matrix elements for %(process_lines)s -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -44,7 +44,7 @@ namespace mg5amcCpu %(cipdhrdcod)s %(cipchrdcod)s #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipddevice)s %(cipcdevice)s #else @@ -54,7 +54,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -80,8 +80,8 @@ namespace mg5amcCpu // Helicities for the process [NB do keep 'static' for this constexpr array, see issue #283] // *** NB There is no automatic check yet that these are in the same order as Fortran! #569 *** %(all_helicities)s -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -117,7 +117,7 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory %(cipdassign)s %(cipcassign)s -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL %(cipd2tipdSym)s %(cipc2tipcSym)s #else @@ -150,7 +150,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -179,6 +179,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -192,6 +196,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -215,12 +221,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -241,7 +247,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -367,9 +373,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -393,7 +399,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -413,7 +419,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { %(den_factors)s }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc index 893f7f3215..2c3adf57e2 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_h.inc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. 
Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by %(info_lines)s @@ -23,7 +23,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -32,7 +32,7 @@ namespace mg5amcCpu %(process_class_definitions)s //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -45,7 +45,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -75,7 +75,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc index 1e473edcf8..960f029d8d 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_matrix.inc @@ -4,7 +4,7 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== // *** COLOR CHOICE BELOW *** @@ -17,7 +17,7 @@ // (This method used to be called %(process_class_name)s::matrix_%(proc_name)s(%(matrix_args)s)?) 
%(color_matrix_lines)s -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -74,7 +74,7 @@ #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -133,7 +133,7 @@ MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%%6d ihel=%%2d me_running=%%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc index 159e3d8d5d..66450ae367 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/process_sigmaKin_function.inc @@ -4,11 +4,14 @@ ! Copyright (C) 2020-2023 CERN and UCLouvain. ! Licensed under the GNU Lesser General Public License (version 3 or later). ! Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -! Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +! Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. !========================================================================== + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -36,7 +39,7 @@ // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -246,7 +249,7 @@ // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. 
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. 
Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc index 2d1578cb43..dbe151e990 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/gpu/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
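The runTest.cc hunk above (checkGpu( gpuDeviceReset() )) and the gpuMemcpyToSymbol calls in the generated process code rely on thin gpu* wrappers from the new GpuAbstraction.h/GpuRuntime.h headers, which are not reproduced in this excerpt. A plausible minimal mapping, sketched here only to clarify the call sites and not claimed to be the actual header contents, could look like this:

#include <cassert>
#include <cstdio>

#ifdef __CUDACC__
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuDeviceReset cudaDeviceReset
#define gpuMemcpyToSymbol( symbol, src, bytes ) cudaMemcpyToSymbol( symbol, src, bytes )
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuDeviceReset hipDeviceReset
#define gpuMemcpyToSymbol( symbol, src, bytes ) hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes )
#endif

// Abort with a printout if a CUDA/HIP runtime call did not return success
#define checkGpu( code ) \
  { \
    const gpuError_t err = ( code ); \
    if( err != gpuSuccess ) \
    { \
      printf( "GPU runtime error: %s\n", gpuGetErrorString( err ) ); \
      assert( false ); \
    } \
  }

With such a mapping, checkGpu( gpuDeviceReset() ) and gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) compile identically under nvcc and hipcc.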
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt %% neppM == 0 ); // nevt must be a multiple of neppM assert( nevt %% neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/read_slha.cc b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/read_slha.cc +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/madgraph/iolibs/template_files/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py index 83b61a9565..3e0ebe545f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/model_handling.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: O. Mattelaer (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. import os @@ -1110,7 +1110,7 @@ def get_process_function_definitions(self, write=True): %(len(coupling_indep), ' ), cxmake( m_pars->'.join(coupling_indep)) # AV only indep! replace_dict['cipcdevice'] = '__device__ __constant__ fptype cIPC[%i];'%(2*len(coupling_indep)) replace_dict['cipcstatic'] = 'static fptype cIPC[%i];'%(2*len(coupling_indep)) - replace_dict['cipc2tipcSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) );'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = 'gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipc2tipc'] = 'memcpy( cIPC, tIPC, %i * sizeof( cxtype ) );'%len(coupling_indep) replace_dict['cipcdump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPC[i] = " << tIPC[i] << std::endl;'%len(coupling_indep) coup_str_hrd = '__device__ const fptype cIPC[%s] = { ' % (len(coupling_indep)*2) @@ -1121,7 +1121,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipcassign'] = '//const cxtype tIPC[0] = { ... 
}; // nicoup=0' replace_dict['cipcdevice'] = '__device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0' replace_dict['cipcstatic'] = 'static fptype* cIPC = nullptr; // unused as nicoup=0' - replace_dict['cipc2tipcSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ) ); // nicoup=0'%len(coupling_indep) + replace_dict['cipc2tipcSym'] = '//gpuMemcpyToSymbol( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipc2tipc'] = '//memcpy( cIPC, tIPC, %i * sizeof( cxtype ) ); // nicoup=0'%len(coupling_indep) replace_dict['cipcdump'] = '' replace_dict['cipchrdcod'] = '__device__ const fptype* cIPC = nullptr; // unused as nicoup=0' @@ -1130,7 +1130,7 @@ def get_process_function_definitions(self, write=True): %(len(params), ', (fptype)m_pars->'.join(params)) replace_dict['cipddevice'] = '__device__ __constant__ fptype cIPD[%i];'%(len(params)) replace_dict['cipdstatic'] = 'static fptype cIPD[%i];'%(len(params)) - replace_dict['cipd2tipdSym'] = 'checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) );'%len(params) + replace_dict['cipd2tipdSym'] = 'gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipd2tipd'] = 'memcpy( cIPD, tIPD, %i * sizeof( fptype ) );'%len(params) replace_dict['cipddump'] = '\n //for ( i=0; i<%i; i++ ) std::cout << std::setprecision(17) << "tIPD[i] = " << tIPD[i] << std::endl;'%len(params) param_str_hrd = '__device__ const fptype cIPD[%s] = { ' % len(params) @@ -1141,7 +1141,7 @@ def get_process_function_definitions(self, write=True): replace_dict['cipdassign'] = '//const fptype tIPD[0] = { ... }; // nparam=0' replace_dict['cipddevice'] = '//__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0' replace_dict['cipdstatic'] = '//static fptype* cIPD = nullptr; // unused as nparam=0' - replace_dict['cipd2tipdSym'] = '//checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ) ); // nparam=0'%len(params) + replace_dict['cipd2tipdSym'] = '//gpuMemcpyToSymbol( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipd2tipd'] = '//memcpy( cIPD, tIPD, %i * sizeof( fptype ) ); // nparam=0'%len(params) replace_dict['cipddump'] = '' replace_dict['cipdhrdcod'] = '//__device__ const fptype* cIPD = nullptr; // unused as nparam=0' @@ -1219,13 +1219,13 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -1252,7 +1252,7 @@ def get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( \"calculate_wavefunctions: ihel=%2d\\n\", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( \"calculate_wavefunctions: ievt00=%d\\n\", ievt00 ); #endif""") nwavefuncs = self.matrix_elements[0].get_number_of_wavefunctions() @@ -1289,7 +1289,7 @@ def 
get_all_sigmaKin_lines(self, color_amplitudes, class_name): #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif""") ret_lines += helas_calls @@ -1718,8 +1718,10 @@ def super_get_matrix_element_calls(self, matrix_element, color_amplitudes, multi allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -1835,7 +1837,7 @@ def get_external(self, wf, argument): split_line2 = [ str.lstrip(' ').rstrip(' ') for str in split_line2] # AV split_line2.insert(2, '0') # add parameter fmass=0 line2 = ', '.join(split_line2) - text = '#if not( defined __CUDACC__ and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV + text = '#if not( defined MGONGPUCPP_GPUIMPL and defined MGONGPU_TEST_DIVERGENCE )\n %s\n#else\n if( ( blockDim.x * blockIdx.x + threadIdx.x ) %% 2 == 0 )\n %s\n else\n %s\n#endif\n' # AV return text % (line, line, line2) text = '%s\n' # AV return text % line diff --git a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py index 5267141530..c89295c01f 100644 --- a/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py +++ b/epochX/cudacpp/CODEGEN/PLUGIN/CUDACPP_SA_OUTPUT/output.py @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2021-2023) for the MG5aMC CUDACPP plugin. 
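As a concrete illustration of the model_handling.py substitutions above, the generated code now copies independent parameters to constant memory through the portable wrapper instead of the CUDA-only checkCuda( cudaMemcpyToSymbol( ... ) ). The fragment below is a hypothetical sketch, not output of this codegen: the helper name copyIndependentParameters and the two-parameter layout are invented for illustration, while fptype, MGONGPUCPP_GPUIMPL and gpuMemcpyToSymbol come from the patch.

#include <cstring> // for memcpy in the C++ build
#include "mgOnGpuConfig.h" // assumed: fptype, MGONGPUCPP_GPUIMPL
#include "GpuAbstraction.h" // assumed: gpuMemcpyToSymbol wrapper (header added by this patch, not shown here)

#ifdef MGONGPUCPP_GPUIMPL
__device__ __constant__ fptype cIPD[2]; // independent parameters in GPU constant memory
#else
static fptype cIPD[2]; // file-scope static emulation of constant memory in C++
#endif

// Copy two hypothetical independent parameters (e.g. a mass and a width) into cIPD
inline void copyIndependentParameters( const fptype mass, const fptype width )
{
  const fptype tIPD[2] = { mass, width };
#ifdef MGONGPUCPP_GPUIMPL
  gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) );
#else
  memcpy( cIPD, tIPD, 2 * sizeof( fptype ) );
#endif
}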
import os import subprocess @@ -88,9 +88,9 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): 'CMake': [s+'CMake/Compilers.txt', s+'CMake/Platforms.txt', s+'CMake/Macros.txt'], 'src': [s+'gpu/rambo.h', s+'read_slha.h', s+'read_slha.cc', s+'gpu/mgOnGpuFptypes.h', s+'gpu/mgOnGpuCxtypes.h', s+'gpu/mgOnGpuVectors.h', - s+'CMake/src/CMakeLists.txt'], + s+'CMake/src/CMakeLists.txt' ], 'SubProcesses': [s+'gpu/nvtx.h', s+'gpu/timer.h', s+'gpu/timermap.h', - s+'gpu/ompnumthreads.h', s+'gpu/CudaRuntime.h', + s+'gpu/ompnumthreads.h', s+'gpu/GpuRuntime.h', s+'gpu/GpuAbstraction.h', s+'gpu/MemoryAccessHelpers.h', s+'gpu/MemoryAccessVectors.h', s+'gpu/MemoryAccessMatrixElements.h', s+'gpu/MemoryAccessMomenta.h', s+'gpu/MemoryAccessRandomNumbers.h', s+'gpu/MemoryAccessWeights.h', @@ -111,7 +111,7 @@ class PLUGIN_ProcessExporter(PLUGIN_export_cpp.ProcessExporterGPU): s+'CMake/SubProcesses/CMakeLists.txt'], 'test': [s+'gpu/cudacpp_test.mk']} to_link_in_P = ['nvtx.h', 'timer.h', 'timermap.h', - 'ompnumthreads.h', 'CudaRuntime.h', + 'ompnumthreads.h', 'GpuRuntime.h', 'GpuAbstraction.h', 'MemoryAccessHelpers.h', 'MemoryAccessVectors.h', 'MemoryAccessMatrixElements.h', 'MemoryAccessMomenta.h', 'MemoryAccessRandomNumbers.h', 'MemoryAccessWeights.h', diff --git a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt index 36b42987c5..a484a3ce73 100644 --- a/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.mad/CODEGEN_mad_ee_mumu_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005498409271240234  +DEBUG: model prefixing takes 0.0055043697357177734  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -174,7 +174,7 @@ INFO: Generating Helas calls for process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Processing color information for process: e+ e- > mu+ mu- @1 INFO: Creating files in directory P1_epem_mupmum DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,19 +191,19 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
INFO: Generating Feynman diagrams for Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group epem_mupmum Generated helas calls for 1 subprocesses (2 diagrams) in 0.004 s -Wrote files for 8 helas calls in 0.102 s +Wrote files for 8 helas calls in 0.098 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines -ALOHA: aloha creates 3 routines in 0.203 s +ALOHA: aloha creates 3 routines in 0.198 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 7 routines in 0.260 s +ALOHA: aloha creates 7 routines in 0.252 s FFV1 FFV1 FFV2 @@ -248,9 +248,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m1.900s -user 0m1.697s -sys 0m0.195s +real 0m1.882s +user 0m1.658s +sys 0m0.191s Code generation completed in 2 seconds ************************************************************ * * @@ -277,7 +277,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -307,7 +307,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_ee_mumu/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/ee_mumu.mad/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
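Note on the Bridge.h hunk above: the std::filesystem lookup of the param card is replaced by a stat()-based lambda to ease portability on LUMI/HIP (#803). A minimal standalone sketch of that same pattern follows; this is illustrative only (not generated code) and the hardcoded path is just an example.

  // Sketch: POSIX stat() as a std::filesystem::exists replacement (cf. Bridge.h above)
  #include <sys/stat.h>
  #include <iostream>
  #include <string>

  int main()
  {
    // same lambda shape as in Bridge.h: true if the file can be stat()-ed
    auto fileExists = []( const std::string& fileName )
    { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; };
    std::string paramCard = "../../Cards/param_card.dat"; // example path only
    if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // same fallback as Bridge.h
    std::cout << "param card resolved to: " << paramCard << std::endl;
    return 0;
  }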
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
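The new GpuAbstraction.h above maps one set of gpu* names onto either the CUDA or the HIP runtime at compile time. A reduced, self-contained sketch of the same idea is shown below; the macro subset is redefined inline here so the snippet compiles on its own with nvcc or hipcc, whereas the real header additionally wraps every call in checkGpu.

  // Sketch: one source file, two backends, selected by the compiler in use
  #include <cstdio>
  #ifdef __CUDACC__
  #include <cuda_runtime.h>
  #define gpuMalloc( ptr, size ) cudaMalloc( ptr, size )
  #define gpuMemcpy( dst, src, bytes, dir ) cudaMemcpy( dst, src, bytes, dir )
  #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
  #define gpuFree( ptr ) cudaFree( ptr )
  #elif defined __HIPCC__
  #include <hip/hip_runtime.h>
  #define gpuMalloc( ptr, size ) hipMalloc( ptr, size )
  #define gpuMemcpy( dst, src, bytes, dir ) hipMemcpy( dst, src, bytes, dir )
  #define gpuMemcpyHostToDevice hipMemcpyHostToDevice
  #define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
  #define gpuFree( ptr ) hipFree( ptr )
  #endif

  int main()
  {
    constexpr int n = 16;
    double hstBuf[n];
    for( int i = 0; i < n; i++ ) hstBuf[i] = i;
    double* devBuf = nullptr;
    gpuMalloc( (void**)&devBuf, n * sizeof( double ) );                       // cudaMalloc or hipMalloc
    gpuMemcpy( devBuf, hstBuf, n * sizeof( double ), gpuMemcpyHostToDevice ); // host to device
    gpuMemcpy( hstBuf, devBuf, n * sizeof( double ), gpuMemcpyDeviceToHost ); // device to host
    gpuFree( devBuf );
    printf( "round trip ok, hstBuf[15] = %f\n", hstBuf[15] );                 // expect 15.0
    return 0;
  }

The point of the abstraction is that call sites like the ones in Bridge.h and MemoryBuffers.h no longer need any per-backend #ifdef.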
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
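As the comment in the hunk above states, GpuRuntime is meant to be instantiated at the beginning of the application's main, so that gpuSetDevice(0) runs in the constructor and gpuDeviceReset() in the destructor. A hedged usage sketch, assuming a GPU build where MGONGPUCPP_GPUIMPL is defined and GpuRuntime.h is on the include path:

  // Sketch: scope-based GPU setup/teardown via the GpuRuntime RAII wrapper above
  #include "GpuRuntime.h" // defines GpuRuntime only when MGONGPUCPP_GPUIMPL is set

  int main()
  {
  #ifdef MGONGPUCPP_GPUIMPL
    mg5amcGpu::GpuRuntime gpuRuntime; // constructor calls gpuSetDevice(0)
  #endif
    // ... allocate buffers, build kernels, launch, copy results back ...
    return 0; // GpuRuntime destructor calls gpuDeviceReset() on exit
  }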
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
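The MatrixElementKernels.cc hunks above replace every triple-chevron launch by the gpuLaunchKernel / gpuLaunchKernelSharedMem macros from GpuAbstraction.h, keeping the same gpuPeekAtLastError / gpuDeviceSynchronize checks afterwards. A minimal self-contained sketch of that launch-and-check pattern (macro and error check reduced inline so it compiles alone with nvcc; the kernel and grid sizes are examples only):

  // Sketch: gpuLaunchKernel-style macro expanding to a <<<blocks,threads>>> launch
  #include <cassert>
  #include <cstdio>
  #include <cuda_runtime.h>

  inline void checkGpuCode( cudaError_t code ) // reduced stand-in for assertGpu/checkGpu
  {
    if( code != cudaSuccess )
    {
      printf( "ERROR! '%s' (%d)\n", cudaGetErrorString( code ), code );
      assert( code == cudaSuccess );
    }
  }
  #define checkGpu( code ) checkGpuCode( code )
  #define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )

  __global__ void fillIota( int* out )
  {
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one event per GPU thread
    out[ievt] = ievt;
  }

  int main()
  {
    constexpr int gpublocks = 2, gputhreads = 32, nevt = gpublocks * gputhreads;
    int* devOut = nullptr;
    checkGpu( cudaMalloc( (void**)&devOut, nevt * sizeof( int ) ) );
    gpuLaunchKernel( fillIota, gpublocks, gputhreads, devOut ); // was: fillIota<<<gpublocks, gputhreads>>>( devOut )
    checkGpu( cudaPeekAtLastError() );                          // catch launch errors
    checkGpu( cudaDeviceSynchronize() );                        // catch runtime errors in the kernel
    int hstOut[nevt];
    checkGpu( cudaMemcpy( hstOut, devOut, nevt * sizeof( int ), cudaMemcpyDeviceToHost ) );
    checkGpu( cudaFree( devOut ) );
    printf( "hstOut[63] = %d\n", hstOut[63] ); // expect 63
    return 0;
  }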
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
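All of the headers above switch their namespace guard from __CUDACC__ to MGONGPUCPP_GPUIMPL, so that a single macro covers both CUDA and HIP builds. The macro itself is not defined in this part of the diff; presumably it comes from the common configuration header, roughly as sketched below. Treat the first block as an assumption, not the actual definition.

  // Assumed sketch only: how a GPU-implementation macro could be derived once,
  // so that individual headers test a single symbol instead of __CUDACC__/__HIPCC__
  #if defined __CUDACC__ || defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL 1
  #endif

  // The per-header idiom used throughout the files above:
  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu
  #else
  namespace mg5amcCpu
  #endif
  {
    // types defined differently for CPU and GPU builds (see #318 and #725)
  }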
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
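The MemoryBuffers.h hunks above keep the same RAII structure as before, with the buffer constructors and destructors now calling the backend-neutral gpuMallocHost/gpuMalloc/gpuFreeHost/gpuFree macros. Below is a simplified standalone sketch of the DeviceBufferBase pattern; the CUDA spelling is used directly so it compiles without the repo headers, and the generated classes additionally track sizes per event and whether the buffer lives on the device.

  // Sketch: RAII device buffer in the spirit of DeviceBufferBase above
  #include <cstddef>
  #include <cuda_runtime.h> // real code: gpuMalloc/gpuFree from GpuAbstraction.h, checked via checkGpu

  template<typename T>
  class SimpleDeviceBuffer
  {
  public:
    explicit SimpleDeviceBuffer( const std::size_t size ) : m_size( size ), m_data( nullptr )
    {
      cudaMalloc( (void**)&m_data, bytes() ); // allocate on construction
    }
    ~SimpleDeviceBuffer() { cudaFree( m_data ); } // release on destruction
    SimpleDeviceBuffer( const SimpleDeviceBuffer& ) = delete;            // non-copyable: the buffer owns its memory
    SimpleDeviceBuffer& operator=( const SimpleDeviceBuffer& ) = delete;
    T* data() { return m_data; }
    std::size_t size() const { return m_size; }
    std::size_t bytes() const { return m_size * sizeof( T ); }
  private:
    const std::size_t m_size;
    T* m_data;
  };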
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc index 9193aa2382..c57ff8d2b0 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef 
__CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -286,7 +287,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -343,7 +344,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -402,7 +403,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -449,8 +450,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -490,9 +491,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -529,7 +530,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -558,6 +559,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -571,6 +576,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -594,12 +601,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -620,7 +627,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -746,9 +753,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -772,7 +779,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -792,7 +799,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -806,9 +813,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -836,7 +846,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA 
(one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1046,7 +1056,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h index 77b610753c..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file 
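Note on the new GpuAbstraction.h and GpuRuntime.h symlinks introduced above: throughout this patch, direct CUDA constructs such as checkCuda( cudaMemcpyToSymbol( ... ) ) and kernel<<<blocks,threads>>> launches are replaced by vendor-neutral wrappers (gpuMemcpyToSymbol, gpuLaunchKernel, checkGpu). A minimal sketch of what such an abstraction header could contain is given below; it only assumes the wrapper names visible in this patch plus standard CUDA and HIP runtime calls, and the actual GpuAbstraction.h in the repository may differ in both content and error handling.

// Illustrative sketch only (not the repository file): a minimal GPU abstraction
// layer in the spirit of GpuAbstraction.h, mapping the vendor-neutral names used
// in this patch onto the CUDA or HIP runtime APIs.
#include <cassert>
#include <cstdio>

#ifdef __CUDACC__ // CUDA build (nvcc, "-x cu")
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuDeviceReset cudaDeviceReset
#define gpuMemcpyToSymbol( symbol, src, bytes ) cudaMemcpyToSymbol( symbol, src, bytes )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__ // HIP build (hipcc, "-x hip")
#include "hip/hip_runtime.h"
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuDeviceReset hipDeviceReset
#define gpuMemcpyToSymbol( symbol, src, bytes ) hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif

#if defined __CUDACC__ || defined __HIPCC__
// Check the return code of a gpu* runtime call (same role as the former checkCuda helper)
#define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
inline void assertGpu( gpuError_t code, const char* file, int line )
{
  if( code != gpuSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
    assert( code == gpuSuccess );
  }
}
#endif

With definitions of this kind, the call sites shown in this patch, for example gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) and gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ), compile unchanged for both backends.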
diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
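The comments above describe how cudacpp.mk selects a single GPU compiler, GPUCC, from CUDA_HOME or HIP_HOME. On the source side, the rest of this patch funnels both backends through one preprocessor switch, MGONGPUCPP_GPUIMPL, in place of the CUDA-only __CUDACC__. As a purely illustrative sketch (the real definition belongs to the plugin's mgOnGpuConfig.h and may be more elaborate), such a switch could be derived from the compiler's own predefined macros:

// Illustrative sketch only: deriving a single "this is a GPU build" switch,
// MGONGPUCPP_GPUIMPL, from the predefined macros of the compiler chosen as
// GPUCC in cudacpp.mk (nvcc defines __CUDACC__, hipcc defines __HIPCC__).
#if defined __CUDACC__
#define MGONGPUCPP_GPUIMPL 1 // CUDA build
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // HIP build
#endif // plain C++ build otherwise: MGONGPUCPP_GPUIMPL stays undefined

// Downstream code then needs a single guard for "any GPU implementation":
#ifdef MGONGPUCPP_GPUIMPL
// e.g. use namespace mg5amcGpu, pinned host buffers, kernel launches
#else
// e.g. use namespace mg5amcCpu, plain host buffers, SIMD loops over events
#endif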
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
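A note on the build scheme set up by the rules above: each SubProcess source is compiled twice, once as plain C++ (the %.o rule, using $(CXX)) and once as GPU code (the %_cu.o rule, using $(GPUCC) with "-x cu" or "-x hip"), and both sets of objects can be linked into the same test executable. This is why the sources keep the "NB: namespaces mg5amcGpu and mg5amcCpu ..." guard seen throughout the patch: the two builds define some types differently, and the distinct namespaces keep the duplicated symbols from clashing at link time (see issues #318 and #725 referenced in the patch). A hypothetical example; the struct below is invented purely for illustration:

// Illustrative sketch only: the same translation unit is compiled twice (C++
// and GPU); wrapping it in different namespaces avoids one-definition-rule
// clashes when both object files are linked into one executable.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // the GPU build of this translation unit
#else
namespace mg5amcCpu // the C++ build of this translation unit
#endif
{
  struct ExampleBuffer // hypothetical type whose layout differs between builds
  {
#ifdef MGONGPUCPP_GPUIMPL
    void* deviceData; // e.g. a device pointer in the GPU build
#else
    double hostData[4]; // e.g. a plain host array (or SIMD vectors) in the C++ build
#endif
  };
}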
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. 
Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
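The fbridge.cc and runTest.cc hunks above rely on a small GpuRuntime helper: fbridgecreate_/fbridgedelete_ call its static setUp()/tearDown(), check_sa.cc instantiates one at the start of main(), and the test fixture resets the device via checkGpu( gpuDeviceReset() ). Based only on the comments in this patch (on CUDA, setUp amounts to cudaSetDevice(0) and tearDown to cudaDeviceReset()), a sketch of such a helper could look as follows; the HIP branch and the debug printouts are assumptions, and the real GpuRuntime.h may differ:

// Illustrative sketch only: the likely shape of the GpuRuntime helper used in
// the hunks above. The patch comments state that, on CUDA, the constructor
// invokes cudaSetDevice(0) and the destructor books a cudaDeviceReset().
#include <cstdio>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#endif

struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); } // RAII: release the device when main() (or a test) ends
  static void setUp( const bool debug = true ) // also called from fbridgecreate_
  {
#ifdef __CUDACC__
    cudaSetDevice( 0 ); // bind this process to GPU 0 (as stated in the patch comments)
#elif defined __HIPCC__
    hipSetDevice( 0 ); // HIP equivalent (assumed)
#endif
    if( debug ) printf( "GpuRuntime: GPU initialised\n" );
  }
  static void tearDown( const bool debug = true ) // also called from fbridgedelete_
  {
#ifdef __CUDACC__
    cudaDeviceReset(); // needed e.g. by cuda-memcheck --leak-check full (see runTest.cc)
#elif defined __HIPCC__
    hipDeviceReset(); // HIP equivalent (assumed)
#endif
    if( debug ) printf( "GpuRuntime: GPU reset\n" );
  }
private:
  const bool m_debug;
};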
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py +++ 
b/epochX/cudacpp/ee_mumu.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h index 9fa30cfd7f..e878fcd28e 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc index 0b4be4d5ed..cffc5d3bff 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h index 64d0b8e761..2a6d960581 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h index 80032e528b..6bde4466d0 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined MGONGPUCPP_GPUIMPL && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
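The config hunks above make the intent explicit: MGONGPUCPP_GPUIMPL means "some GPU backend", while __CUDACC__ and __HIPCC__ stay compiler-specific, which is why the mgOnGpuCxtypes.h hunks below keep testing __CUDACC__ for thrust/cuComplex. A compact sketch of the resulting three-way dispatch (illustrative only; backendId is a hypothetical helper):

#include <cstdio>
int backendId()
{
#if defined __CUDACC__
  return 2; // CUDA build: thrust, cuComplex or cxsmpl may be used for complex numbers
#elif defined __HIPCC__
  return 1; // HIP build: only cxsmpl is available for complex numbers (#810)
#else
  return 0; // C++ build: std::complex or cxsmpl
#endif
}
int main() { std::printf( "backendId=%d\n", backendId() ); return 0; }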
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and 
#725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.mad/src/rambo.h b/epochX/cudacpp/ee_mumu.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/ee_mumu.mad/src/read_slha.cc b/epochX/cudacpp/ee_mumu.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/ee_mumu.mad/src/read_slha.cc +++ b/epochX/cudacpp/ee_mumu.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt index 636fab0372..2764fbfcfb 100644 --- a/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt +++ b/epochX/cudacpp/ee_mumu.sa/CODEGEN_cudacpp_ee_mumu_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate e+ e- > mu+ mu- No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.00569605827331543  +DEBUG: model prefixing takes 0.00559234619140625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -181,7 +181,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates FFV2 routines ALOHA: aloha creates FFV4 routines ALOHA: aloha creates FFV2_4 routines -ALOHA: aloha creates 4 routines in 0.271 s +ALOHA: aloha creates 4 routines in 0.266 s FFV1 FFV1 FFV2 @@ -200,7 +200,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_ee_mumu/src/. quit -real 0m0.662s -user 0m0.604s -sys 0m0.052s +real 0m0.654s +user 0m0.591s +sys 0m0.054s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/ee_mumu.sa/COPYRIGHT +++ b/epochX/cudacpp/ee_mumu.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? 
what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
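The new GpuAbstraction.h above is a thin macro layer: each gpu* allocation, copy and teardown call expands to the corresponding cuda* or hip* runtime call wrapped in checkGpu. A usage sketch under those assumptions (the function copyRoundTrip is hypothetical; compilation with nvcc or hipcc is assumed so that MGONGPUCPP_GPUIMPL and checkGpu are defined):

#include "mgOnGpuConfig.h"
#include "GpuRuntime.h" // brings in GpuAbstraction.h and the checkGpu error-checking macro
#ifdef MGONGPUCPP_GPUIMPL
void copyRoundTrip( const double* hstIn, double* hstOut, int n )
{
  double* dev = nullptr;
  gpuMalloc( &dev, n * sizeof( double ) );                               // cudaMalloc or hipMalloc, checked
  gpuMemcpy( dev, hstIn, n * sizeof( double ), gpuMemcpyHostToDevice );  // host-to-device copy, checked
  gpuMemcpy( hstOut, dev, n * sizeof( double ), gpuMemcpyDeviceToHost ); // device-to-host copy, checked
  gpuFree( dev );                                                        // cudaFree or hipFree, checked
}
#endif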
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
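As the comments in GpuRuntime.h above describe, the struct is meant to be used as an RAII guard around the whole application: the constructor calls gpuSetDevice(0) and the destructor books the gpuDeviceReset(). A minimal sketch of that intended use (illustrative only, not part of the patch):

#include "mgOnGpuConfig.h"
#include "GpuRuntime.h"
int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  mg5amcGpu::GpuRuntime gpuRuntime; // setUp() runs here, tearDown() runs automatically when main returns
#endif
  // ... create a Bridge or the matrix-element kernels and run the event loop here ...
  return 0;
}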
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
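The gpuLaunchKernel and gpuLaunchKernelSharedMem wrappers that replace the triple-chevron launches in MatrixElementKernels.cc above are presumably provided by the same abstraction header. A minimal sketch of how such wrappers could be written (an illustration under that assumption, not the PR's actual definitions):

// Hypothetical sketch of the kernel-launch wrappers (illustration only).
// CUDA keeps the native <<<blocks,threads,sharedMem>>> launch syntax; HIP goes through hipLaunchKernelGGL.
#if defined __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), ( sharedMem ), 0, __VA_ARGS__ )
#endif

Read this way, gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ) in a CUDA build expands back to exactly the computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( ... ) launch it replaces, while a HIP build gets the equivalent hipLaunchKernelGGL call.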
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
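The index computation in KernelAccessHelper above, ievt = blockDim.x * blockIdx.x + threadIdx.x (repeated later in the sigmaKin kernels), encodes the one-event-per-GPU-thread convention. A small worked example with made-up launch parameters (not values taken from this PR):

// Illustration only: one event per GPU thread.
constexpr int gputhreads = 256;  // threads per block, i.e. blockDim.x
constexpr int exampleBlock = 2;  // blockIdx.x of some thread
constexpr int exampleThread = 5; // threadIdx.x of that thread
constexpr int ievt = gputhreads * exampleBlock + exampleThread; // = 517: the event this thread owns

so a grid of gpublocks * gputhreads threads processes exactly gpublocks * gputhreads events, one each.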
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
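The neppM comment above ("a power of 2 times the number of fptype's in a 32-byte cacheline") can be made concrete with a small numerical example (the numbers are an illustration, not values taken from this PR):

#include <cstddef>
// Illustration only: how many fptype values fit in one 32-byte cacheline.
constexpr std::size_t cachelineBytes = 32;
constexpr std::size_t perCachelineDouble = cachelineBytes / sizeof( double ); // 4
constexpr std::size_t perCachelineFloat = cachelineBytes / sizeof( float );   // 8
// so neppM would be 4, 8, 16, ... in double precision and 8, 16, 32, ... in single precision,
// keeping the momenta accessed by consecutive GPU threads inside whole, aligned cachelines.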
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
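In the PinnedHostBufferBase and DeviceBufferBase constructors and destructors above, the explicit checkCuda( cudaMallocHost( ... ) ) style wrappers disappear and the calls become bare gpuMallocHost / gpuMalloc / gpuFreeHost / gpuFree. A plausible reading (an assumption here, not something the diff states) is that the error check moves into the abstraction macros themselves, along these lines:

// Hypothetical sketch of the memory-management wrappers (illustration only, not the PR's actual definitions).
#if defined __CUDACC__
#define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) ) // pinned host memory
#define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__
#define gpuMallocHost( ptr, bytes ) checkGpu( hipHostMalloc( ptr, bytes ) ) // pinned host memory
#define gpuMalloc( ptr, bytes ) checkGpu( hipMalloc( ptr, bytes ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif

The gpuMemcpy name and the gpuMemcpyHostToDevice / gpuMemcpyDeviceToHost kinds are the ones used just below in copyDeviceFromHost and copyHostFromDevice.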
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 87bcecccd9..b87b14d41f 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: e+ e- > mu+ mu- WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MZ, (fptype)Parameters_sm::mdl_WZ }; __device__ const fptype cIPC[6] = { (fptype)Parameters_sm::GC_3.real(), (fptype)Parameters_sm::GC_3.imag(), (fptype)Parameters_sm::GC_50.real(), (fptype)Parameters_sm::GC_50.imag(), (fptype)Parameters_sm::GC_59.real(), (fptype)Parameters_sm::GC_59.imag() }; #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype cIPC[6]; #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: 
ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -284,7 +285,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 1 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -341,7 +342,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -400,7 +401,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -447,8 +448,8 @@ namespace mg5amcCpu { 1, -1, 1, 1 }, { 1, -1, -1, -1 }, { 1, -1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -488,9 +489,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MZ, (fptype)m_pars->mdl_WZ }; const cxtype tIPC[3] = { cxmake( m_pars->GC_3 ), cxmake( m_pars->GC_50 ), cxmake( m_pars->GC_59 ) }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + gpuMemcpyToSymbol( cIPC, tIPC, 3 * sizeof( cxtype ) ); #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); memcpy( cIPC, tIPC, 3 * sizeof( cxtype ) ); @@ -527,7 +528,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -556,6 +557,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." 
<< __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -569,6 +574,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -592,12 +599,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -618,7 +625,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -744,9 +751,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -770,7 +777,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -790,7 +797,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 4 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -804,9 +811,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -834,7 +844,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** 
START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1044,7 +1054,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h index 77b610753c..0b29ffb3ff 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ 
b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
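One more wrapper worth spelling out is gpuMemcpyToSymbol, which replaces the checkCuda( cudaMemcpyToSymbol( ... ) ) calls for cHel, cIPD, cIPC and the good-helicity arrays in CPPProcess.cc above. On HIP the symbol argument typically has to be wrapped in HIP_SYMBOL, so a hedged sketch (an illustration, not the PR's actual definition) would be:

// Hypothetical sketch of the constant-memory copy wrapper (illustration only).
#if defined __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif

With a definition of this kind, gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) works unchanged on both backends.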
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
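In the RandomNumberKernels.h hunk above, the note that the header must come after mgOnGpuConfig.h matters because, for CPU-only builds, mgOnGpuConfig.h defines the CUDA/HIP declaration specifiers away (see its hunk further down in this diff). A simplified sketch of that mechanism, with a hypothetical declaration added purely for illustration:

// Simplified sketch of the mgOnGpuConfig.h behaviour referenced above: on CPU builds
// the GPU qualifiers become no-ops, so they may appear in headers compiled by a plain C++ compiler.
#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __host__
#define __device__
#endif
__global__ void seedRandomGenerator(); // hypothetical: plain declaration on CPU, kernel on GPU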
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. 
Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
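In fbridge.cc and runTest.cc above, the CUDA-specific CudaRuntime::setUp/tearDown and checkCuda( cudaDeviceReset() ) calls become GpuRuntime and checkGpu( gpuDeviceReset() ). The GpuRuntime header itself is not shown in this diff; a plausible sketch of the alias layer, assuming only the standard CUDA and HIP runtime APIs:

// Sketch only (assumed mapping, not the repository's actual GpuRuntime.h):
#include <cassert>
#include <cstdio>
#if defined __CUDACC__
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuDeviceReset cudaDeviceReset
#define gpuGetErrorString cudaGetErrorString
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuDeviceReset hipDeviceReset
#define gpuGetErrorString hipGetErrorString
#endif
// Minimal checkGpu in the spirit of the old checkCuda helper
#define checkGpu( code ) \
  do { \
    gpuError_t err = ( code ); \
    if( err != gpuSuccess ) { printf( "GPU error: %s\n", gpuGetErrorString( err ) ); assert( false ); } \
  } while( 0 )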
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/ee_mumu.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h index 9fa30cfd7f..e878fcd28e 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ 
// Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc index 0b4be4d5ed..cffc5d3bff 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h index 64d0b8e761..2a6d960581 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/ee_mumu.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -235,7 +235,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -259,7 +259,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/ee_mumu.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h index b247654dcf..475749ca7c 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
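The mgOnGpuConfig.h hunks above are the core of this change: MGONGPUCPP_GPUIMPL is defined for any GPU build (it is set whenever __CUDACC__ or __HIPCC__ is defined), while the original __CUDACC__ and __HIPCC__ macros remain in use where CUDA and HIP genuinely differ (complex types, curand availability). The pattern that the rest of the diff converts source files to looks roughly like this (illustration only, mirroring the hunks in this patch):

#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL // any GPU build (CUDA or HIP)
namespace mg5amcGpu
#else // CPU/SIMD build
namespace mg5amcCpu
#endif
{
#if defined __CUDACC__
  // CUDA-only details (e.g. thrust::complex, curand) stay behind __CUDACC__
#elif defined __HIPCC__
  // HIP-only details (e.g. the cxsmpl complex type) stay behind __HIPCC__
#endif
}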
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/ee_mumu.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725)
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/ee_mumu.sa/src/rambo.h b/epochX/cudacpp/ee_mumu.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/rambo.h +++ b/epochX/cudacpp/ee_mumu.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/ee_mumu.sa/src/read_slha.cc b/epochX/cudacpp/ee_mumu.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/ee_mumu.sa/src/read_slha.cc +++ b/epochX/cudacpp/ee_mumu.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt index a477013568..5782086b56 100644 --- a/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.mad/CODEGEN_mad_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005816459655761719  +DEBUG: model prefixing takes 0.005559206008911133  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -191,16 +191,16 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s -Wrote files for 10 helas calls in 0.103 s +Wrote files for 10 helas calls in 0.100 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.155 s +ALOHA: aloha creates 2 routines in 0.146 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 4 routines in 0.135 s +ALOHA: aloha creates 4 routines in 0.132 s VVV1 FFV1 FFV1 @@ -237,9 +237,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. 
quit -real 0m1.729s -user 0m1.515s -sys 0m0.204s +real 0m1.699s +user 0m1.467s +sys 0m0.225s Code generation completed in 2 seconds ************************************************************ * * @@ -266,7 +266,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -296,7 +296,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt.mad/COPYRIGHT b/epochX/cudacpp/gg_tt.mad/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gg_tt.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
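
The Bridge constructor above drops std::filesystem in favour of a small stat()-based existence check to ease portability on LUMI (#803). A self-contained sketch of the same check; the main() driver and its printout are illustrative only:

#include <sys/stat.h>

#include <iostream>
#include <string>

// POSIX stat() based existence check, mirroring the fileExists lambda in the Bridge constructor above.
inline bool fileExists( const std::string& fileName )
{
  struct stat buffer;
  return stat( fileName.c_str(), &buffer ) == 0;
}

int main()
{
  std::string paramCard = "../../Cards/param_card.dat";
  if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // same one-directory-up fallback as in Bridge
  std::cout << "Would read param card: " << paramCard << std::endl;
  return 0;
}
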
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
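
The recurring "#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu" blocks (see #318 and #725) compile the same source into two distinct namespaces, so the GPU and CPU flavours of identical classes can coexist, for example in one combined test executable, without clashing symbols. A minimal sketch of the pattern, assuming a stand-in guard MY_GPUIMPL and an illustrative Engine class (neither appears in the patch):

#include <iostream>

// Stand-in for MGONGPUCPP_GPUIMPL, defined only when compiling with nvcc or hipcc.
#if defined( __CUDACC__ ) || defined( __HIPCC__ )
#define MY_GPUIMPL 1
#endif

#ifdef MY_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // The same definition becomes mg5amcGpu::Engine in GPU builds and mg5amcCpu::Engine in CPU builds.
  struct Engine
  {
    static const char* backend()
    {
#ifdef MY_GPUIMPL
      return "GPU";
#else
      return "CPU";
#endif
    }
  };
}

int main()
{
#ifdef MY_GPUIMPL
  using mg5amcGpu::Engine;
#else
  using mg5amcCpu::Engine;
#endif
  std::cout << "Engine backend: " << Engine::backend() << std::endl;
  return 0;
}
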
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h similarity index 62% rename from epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h rename to epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h index 64ce52f4b3..93579ef08b 100644 --- a/epochX/cudacpp/ee_mumu.sa/SubProcesses/CudaRuntime.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/GpuRuntime.h @@ -1,49 +1,50 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
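
GpuAbstraction.h above maps a single gpu* vocabulary onto either the CUDA or the HIP runtime API, so the rest of the code no longer spells out cuda* calls or the <<<...>>> launch syntax directly. A cut-down standalone sketch of how those macros are meant to be used, reproducing only a CUDA-branch subset and a hypothetical myAddOne kernel (the HIP branch would define the same gpu* names on top of hip* calls, as in the header above):

#include <cstdio>

// Reduced CUDA-only subset of the GpuAbstraction.h idea (compile with nvcc); illustrative, not the plugin header.
#define checkGpu( code ) { const cudaError_t err = ( code ); if( err != cudaSuccess ) printf( "GPU error: %s\n", cudaGetErrorString( err ) ); }
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuMemcpy( dst, src, bytes, dir ) checkGpu( cudaMemcpy( dst, src, bytes, dir ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuDeviceSynchronize cudaDeviceSynchronize
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )

__global__ void myAddOne( double* data ) // hypothetical kernel, not from the patch
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread per event
  data[ievt] += 1.;
}

int main()
{
  constexpr int gpublocks = 2, gputhreads = 32, nevt = gpublocks * gputhreads;
  double hstBuf[nevt] = {};
  double* devBuf = nullptr;
  gpuMalloc( &devBuf, nevt * sizeof( double ) );
  gpuMemcpy( devBuf, hstBuf, nevt * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( myAddOne, gpublocks, gputhreads, devBuf ); // expands to myAddOne<<<gpublocks, gputhreads>>>( devBuf )
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hstBuf, devBuf, nevt * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( devBuf );
  printf( "hstBuf[0] = %f (expect 1.0)\n", hstBuf[0] );
  return 0;
}

Because every backend difference is confined to these macro definitions, the calling code is written once and recompiled per backend, which is the design choice the new header formalises.
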
-#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 // MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API // See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api -#include +#include "GpuAbstraction.h" + #include //-------------------------------------------------------------------------- // See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) { - if( code != cudaSuccess ) + if( code != gpuSuccess ) { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); } } #endif /* clang-format on */ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final + struct GpuRuntime final { - CudaRuntime( const bool debug = true ) + GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; bool m_debug; // Set up CUDA application @@ -62,8 +63,8 @@ namespace mg5amcGpu */ // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
} // Tear down CUDA application (call cudaDeviceReset) @@ -72,14 +73,13 @@ namespace mg5amcGpu // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking static void tearDown( const bool debug = true ) { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); } }; - } #endif //-------------------------------------------------------------------------- -#endif // MG5AMC_CUDARUNTIME_H +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
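
In MatrixElementKernelDevice::computeMatrixElements above, the sigmaKin launch goes through gpuLaunchKernelSharedMem, which forwards an optional dynamic shared-memory size (non-zero only in the MGONGPU_NSIGHT_DEBUG build) as the third <<<...>>> launch argument, and is followed by checkGpu( gpuPeekAtLastError() ) and checkGpu( gpuDeviceSynchronize() ). A hedged sketch of that launch variant with an illustrative reduction kernel, written directly against the CUDA API; mySumPerBlock and the reduced macro are not from the patch:

#include <cstdio>

// Shared-memory launch variant, CUDA only (compile with nvcc).
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )

__global__ void mySumPerBlock( const double* in, double* out ) // illustrative kernel
{
  extern __shared__ double buf[]; // sized at launch time by the sharedMem argument
  buf[threadIdx.x] = in[blockDim.x * blockIdx.x + threadIdx.x];
  __syncthreads();
  if( threadIdx.x == 0 )
  {
    double sum = 0;
    for( unsigned int i = 0; i < blockDim.x; i++ ) sum += buf[i];
    out[blockIdx.x] = sum;
  }
}

int main()
{
  constexpr int nblocks = 2, nthreads = 32, nevt = nblocks * nthreads;
  double *devIn = nullptr, *devOut = nullptr, hstOut[nblocks] = {};
  cudaMalloc( &devIn, nevt * sizeof( double ) );
  cudaMalloc( &devOut, nblocks * sizeof( double ) );
  cudaMemset( devIn, 0, nevt * sizeof( double ) );
  const unsigned int sharedMemSize = nthreads * sizeof( double ); // analogous to ntpbMAX * sizeof( float ) above
  gpuLaunchKernelSharedMem( mySumPerBlock, nblocks, nthreads, sharedMemSize, devIn, devOut );
  cudaPeekAtLastError();   // reports launch-time errors (invalid grid, no device, ...)
  cudaDeviceSynchronize(); // reports errors raised while the kernel was running
  cudaMemcpy( hstOut, devOut, nblocks * sizeof( double ), cudaMemcpyDeviceToHost );
  printf( "block sums: %f %f\n", hstOut[0], hstOut[1] );
  cudaFree( devIn );
  cudaFree( devOut );
  return 0;
}
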
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
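
The MemoryAccessMomenta.h comment above notes that on GPUs neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline, so that momenta reads coalesce; the underlying layout is the AOSOA momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM described in Bridge.h. A small sketch of that indexing, with illustrative sizes only:

#include <cstdio>

// AOSOA indexing sketch for momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM.
int main()
{
  constexpr int neppM = 4;              // events per "page", e.g. 4 doubles = one 32-byte cacheline
  constexpr int npar = 4, np4 = 4;      // external particles and momentum components per event
  constexpr int nevt = 16, npagM = nevt / neppM;
  static double momenta[npagM][npar][np4][neppM] = {};
  const int ievt = 6, ipar = 2, ip4 = 1;                // one (event, particle, component) choice
  const int ipagM = ievt / neppM, ieppM = ievt % neppM; // page and position inside the page
  momenta[ipagM][ipar][ip4][ieppM] = 42.;               // neighbouring ieppM values are contiguous in memory
  printf( "momenta[%d][%d][%d][%d] = %f\n", ipagM, ipar, ip4, ieppM, momenta[ipagM][ipar][ip4][ieppM] );
  return 0;
}
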
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
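
The MemoryBuffers.h hunks above wrap each gpuMalloc/gpuMallocHost in a small RAII class whose destructor calls gpuFree/gpuFreeHost, so device and pinned-host buffers are released whenever they go out of scope. A condensed standalone sketch of the device-buffer case, using the CUDA API directly and simplified names (MyDeviceBuffer is illustrative, not the class in the patch):

#include <cstddef>
#include <cstdio>
#include <stdexcept>

// Minimal RAII device buffer in the spirit of DeviceBufferBase above.
template<typename T>
class MyDeviceBuffer
{
public:
  explicit MyDeviceBuffer( const size_t size )
    : m_size( size ), m_data( nullptr )
  {
    if( cudaMalloc( &m_data, bytes() ) != cudaSuccess )
      throw std::runtime_error( "MyDeviceBuffer: cudaMalloc failed" );
  }
  ~MyDeviceBuffer() { cudaFree( m_data ); } // released even on early returns or exceptions
  MyDeviceBuffer( const MyDeviceBuffer& ) = delete;
  MyDeviceBuffer& operator=( const MyDeviceBuffer& ) = delete;
  T* data() { return m_data; }
  size_t size() const { return m_size; }
  size_t bytes() const { return m_size * sizeof( T ); }
private:
  size_t m_size;
  T* m_data;
};

int main()
{
  MyDeviceBuffer<double> devMomenta( 1024 ); // freed automatically when it goes out of scope
  printf( "allocated %zu bytes on the device\n", devMomenta.bytes() );
  return 0;
}

Tying the free to the destructor is what lets the kernel classes above hold buffers by value and never call gpuFree explicitly.
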
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..dbaa56b35c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; 
++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -573,6 +574,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! 
+ // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -586,6 +591,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -609,12 +616,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +642,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +768,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +794,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +814,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +828,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +861,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1071,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum 
over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. 
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
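The fbridge.cc and runTest.cc hunks above swap the CUDA-only helpers (CudaRuntime::setUp/tearDown, checkCuda, cudaDeviceReset) for backend-neutral ones (GpuRuntime::setUp/tearDown, checkGpu, gpuDeviceReset). Their real definitions live in the new GpuRuntime.h/GpuAbstraction.h headers added by this patch and are not reproduced at this point in the diff; the fragment below is only a sketch, modelled on the checkCuda/assertCuda helper of the deleted CudaRuntime.h shown later in this diff, of how such names could map onto the CUDA or HIP runtime APIs (the exact macro spellings are an assumption).

// Sketch only (assumed mapping, for a GPU build with nvcc or hipcc):
// backend-neutral error checking in the style of the old checkCuda/assertCuda.
#include <cassert>
#include <cstdio>
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuSetDevice cudaSetDevice
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuSetDevice hipSetDevice
#define gpuDeviceReset hipDeviceReset
#endif
#if defined __CUDACC__ || defined __HIPCC__
#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true )
{
  if( code != gpuSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line );
    if( abort ) assert( code == gpuSuccess );
  }
}
#endif

With names like these in place, a GpuRuntime::setUp() can simply call checkGpu( gpuSetDevice( 0 ) ) and a tearDown() can call checkGpu( gpuDeviceReset() ), matching the GpuRuntime::setUp()/tearDown() calls in fbridgecreate_/fbridgedelete_ and the checkGpu( gpuDeviceReset() ) call in the DeviceReset helper above.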
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py +++ 
b/epochX/cudacpp/gg_tt.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h index 932f123fea..5f2f4391b9 100644 --- a/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h index 80032e528b..6bde4466d0 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
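In cudacpp_src.mk above, the same .cc sources now go either to CXX (plain C++ objects such as Parameters_sm.o) or to GPUCC with "-x cu" for nvcc or "-x hip" for hipcc (GPU objects such as Parameters_sm_cu.o). Which namespace a translation unit lands in is then decided purely by the preprocessor, through the MGONGPUCPP_GPUIMPL macro introduced in the mgOnGpuConfig.h hunk that follows. The fragment below is only an illustrative sketch of that idiom, with a hypothetical helper function rather than any real class of the plugin.

// Illustrative sketch (hypothetical helper): one source file, three backends.
// nvcc (-x cu) defines __CUDACC__, hipcc (-x hip) defines __HIPCC__, and in both
// cases mgOnGpuConfig.h defines MGONGPUCPP_GPUIMPL, so the code is compiled into
// the mg5amcGpu namespace; a plain C++ build ends up in mg5amcCpu instead.
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backendName() // hypothetical, for illustration only
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "cuda-or-hip"; // GPU build (e.g. Parameters_sm_cu.o)
#else
    return "cpp"; // C++ build (e.g. Parameters_sm.o)
#endif
  }
}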
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
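The end of the mgOnGpuConfig.h hunk above keeps the trick of defining __global__, __host__ and __device__ as empty macros when MGONGPUCPP_GPUIMPL is not set, so that GPU-annotated code also compiles as plain C++. A minimal sketch of the effect (the function is hypothetical, chosen only for illustration):

// Minimal sketch: the same annotated function builds with nvcc, hipcc or a plain
// C++ compiler, because mgOnGpuConfig.h defines the CUDA/HIP declaration
// specifiers as empty macros when MGONGPUCPP_GPUIMPL is not defined.
#include "mgOnGpuConfig.h"
__host__ __device__ inline double fpSquare( const double x ) // hypothetical helper
{
  return x * x; // device-callable under CUDA/HIP, an ordinary inline function in C++
}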
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef 
__CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.mad/src/rambo.h b/epochX/cudacpp/gg_tt.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.mad/src/read_slha.cc b/epochX/cudacpp/gg_tt.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_tt.mad/src/read_slha.cc +++ b/epochX/cudacpp/gg_tt.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt index 0db09949ad..9d7cc87630 100644 --- a/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt +++ b/epochX/cudacpp/gg_tt.sa/CODEGEN_cudacpp_gg_tt_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005459308624267578  +DEBUG: model prefixing takes 0.005457878112792969  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -180,7 +180,7 @@ Generated helas calls for 1 subprocesses (3 diagrams) in 0.006 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 set of routines with options: P0 ALOHA: aloha creates FFV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.142 s VVV1 FFV1 FFV1 @@ -195,7 +195,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_tt/src/. quit -real 0m0.545s -user 0m0.487s -sys 0m0.049s +real 0m0.546s +user 0m0.467s +sys 0m0.054s Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_tt.sa/COPYRIGHT b/epochX/cudacpp/gg_tt.sa/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gg_tt.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_tt.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? 
what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
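The Bridge.h hunk above replaces the std::filesystem::exists check on the param card with a small stat()-based lambda, bypassing <filesystem> entirely to ease portability on LUMI (#803), where the commented-out alternative would otherwise have had to fall back to std::experimental::filesystem under hipcc. The same check restated as a standalone sketch:

// Sketch: stat()-based existence check, equivalent to the fileExists lambda in
// Bridge.h that bypasses std::filesystem (#803).
#include <sys/stat.h>
#include <string>
inline bool fileExists( const std::string& fileName )
{
  struct stat buffer;
  return stat( fileName.c_str(), &buffer ) == 0; // 0 means the path could be stat'ed, i.e. it exists
}
// Usage, as in the Bridge constructor above:
//   std::string paramCard = "../../Cards/param_card.dat";
//   if( !fileExists( paramCard ) ) paramCard = "../" + paramCard;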
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. 
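Several hunks above (notably Bridge.h) replace checkCuda( cudaMemcpy( ... ) ) and the <<<blocks,threads>>> launch syntax with the backend-neutral gpuMemcpy and gpuLaunchKernel, and the kernel source files now include the new GpuAbstraction.h. The actual definitions belong to that header and are not reproduced at this point in the diff; the snippet below is only an assumed sketch of how such wrappers could be spelled for the two backends (the real header may differ, e.g. by folding its error-checking helper into gpuMemcpy).

// Assumed sketch (not the actual GpuAbstraction.h): mapping the backend-neutral
// names used in Bridge.h onto the CUDA and HIP runtime APIs.
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpy( dst, src, bytes, kind ) cudaMemcpy( dst, src, bytes, kind )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpy( dst, src, bytes, kind ) hipMemcpy( dst, src, bytes, kind )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif
// With definitions along these lines, the Bridge.h call
//   gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, ... )
// expands to a <<<...>>> launch under CUDA and to hipLaunchKernelGGL under HIP.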
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
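
The new GpuAbstraction.h above is the single place where the CUDA and HIP runtime APIs are mapped onto a common gpu* vocabulary (gpuMalloc, gpuMemcpy, gpuLaunchKernel, and so on). The sketch below is a minimal illustration of how code can now be written once against that vocabulary; it is not part of the patch, the kernel 'scale' and the helper 'scaleOnDevice' are hypothetical names, and it assumes a GPU build in which GpuRuntime.h (introduced just below) provides the checkGpu macro.

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // assumed available: defines the checkGpu/assertGpu used inside the gpu* macros

// Hypothetical kernel: scale each element of a device array in place
__global__ void scale( double* d, double factor )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  d[i] *= factor;
}

// Hypothetical helper: the same source builds with nvcc (CUDA) or hipcc (HIP)
void scaleOnDevice( double* hst, const int blocks, const int threads )
{
  const size_t bytes = blocks * threads * sizeof( double );
  double* dev = nullptr;
  gpuMalloc( &dev, bytes );                            // cudaMalloc or hipMalloc
  gpuMemcpy( dev, hst, bytes, gpuMemcpyHostToDevice ); // host-to-device copy
  gpuLaunchKernel( scale, blocks, threads, dev, 2. );  // scale<<<blocks, threads>>>( dev, 2. )
  checkGpu( gpuDeviceSynchronize() );                  // wait for the kernel and check for asynchronous errors
  gpuMemcpy( hst, dev, bytes, gpuMemcpyDeviceToHost ); // device-to-host copy
  gpuFree( dev );
}
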
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. 
Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3.
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
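
The MatrixElementKernels.cc hunks above show the mechanical part of the port: every triple-chevron CUDA launch becomes a gpuLaunchKernel or gpuLaunchKernelSharedMem call, and every checkCuda( cuda... ) becomes checkGpu( gpu... ). The sketch below condenses the resulting launch-site pattern; 'myKernel' and its arguments are placeholders rather than symbols from the patch, and a GPU build is assumed.

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

// Placeholder kernel, standing in for computeDependentCouplings / sigmaKin
__global__ void myKernel( const double* in, double* out )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  out[i] = 2. * in[i];
}

void launchMyKernel( const double* devIn, double* devOut, const int gpublocks, const int gputhreads )
{
  constexpr unsigned int sharedMemSize = 0;
  // expands to myKernel<<<gpublocks, gputhreads, sharedMemSize>>>( devIn, devOut ) under nvcc or hipcc
  gpuLaunchKernelSharedMem( myKernel, gpublocks, gputhreads, sharedMemSize, devIn, devOut );
  checkGpu( gpuPeekAtLastError() );   // catch launch-configuration errors immediately
  checkGpu( gpuDeviceSynchronize() ); // surface errors from the asynchronous kernel execution
}
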
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 
@@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
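
Most of the header changes in this patch are instances of a single idiom: one source tree is compiled into namespace mg5amcGpu for GPU builds and mg5amcCpu for CPU builds, and the selector is now MGONGPUCPP_GPUIMPL instead of the compiler-specific __CUDACC__. A condensed sketch of that idiom follows; 'SomeHelper' is a placeholder class, and it is assumed (not shown in this diff) that mgOnGpuConfig.h is where MGONGPUCPP_GPUIMPL is defined for both CUDA and HIP builds.

#include "mgOnGpuConfig.h" // assumption: defines MGONGPUCPP_GPUIMPL for CUDA and HIP builds

// NB: the two namespaces may define the same types in different ways for CPU and GPU builds (see #318 and #725)
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  class SomeHelper // placeholder for the memory-access helpers modified above
  {
    // ... identical source, compiled once per backend into the appropriate namespace ...
  };
}
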
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer : 
public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
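
In MemoryBuffers.h (hunks above and continuing below) the pinned-host and device buffers now allocate and release memory through the gpu* macros rather than through checkCuda( cudaMallocHost(...) ) and friends. The sketch below condenses that RAII pattern into two stand-alone templates; 'PinnedHostArray' and 'DeviceArray' are illustrative names, not the classes in the patch (those derive from BufferBase and NumberOfEvents), and copy/move handling is omitted.

#include "GpuAbstraction.h"
#include "GpuRuntime.h"

// Page-locked host memory, freed automatically (cf. PinnedHostBufferBase)
template<typename T>
struct PinnedHostArray
{
  explicit PinnedHostArray( const size_t n ) { gpuMallocHost( &m_data, n * sizeof( T ) ); }
  ~PinnedHostArray() { gpuFreeHost( m_data ); }
  T* m_data = nullptr;
};

// Device-resident memory, freed automatically (cf. DeviceBufferBase)
template<typename T>
struct DeviceArray
{
  explicit DeviceArray( const size_t n ) { gpuMalloc( &m_data, n * sizeof( T ) ); }
  ~DeviceArray() { gpuFree( m_data ); }
  T* m_data = nullptr;
};
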
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc index d390883453..c51f01c456 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity 
= 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -299,7 +300,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -356,7 +357,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -415,7 +416,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -462,8 +463,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -503,9 +504,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -541,7 +542,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -570,6 +571,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! 
+ // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -583,6 +588,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -606,12 +613,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -632,7 +639,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -758,9 +765,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -784,7 +791,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -804,7 +811,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -818,9 +825,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -848,7 +858,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1058,7 +1068,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum 
over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ 
b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/P1_Sigma_sm_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
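
The check_sa.cc hunks continuing below replace the CudaRuntime helper with the new GpuRuntime one at the very start of main. A minimal sketch of that usage follows, assuming a GPU build; it is not the actual check_sa.cc main.

#include "GpuRuntime.h"

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
  // the constructor calls gpuSetDevice(0) now, the destructor books gpuDeviceReset() at the end of main
  GpuRuntime gpuRuntime( /*debug=*/true );
#endif
  // ... allocate buffers, generate momenta, compute matrix elements (as in the rest of check_sa.cc) ...
  return 0;
}
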
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
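The RamboSamplingKernels.cc hunks above replace the CUDA-only triple-chevron launches with a generic gpuLaunchKernel( kernel, blocks, threads, args... ) call, so that the same source can be compiled by both nvcc and hipcc. A minimal sketch of such a wrapper, assuming a plain variadic macro (the PR's actual GpuAbstraction.h/GpuRuntime.h may well implement this differently), is:

// Hide the kernel launch syntax behind a single macro; hip-clang also accepts
// the <<<...>>> chevrons, so one definition can serve both backends.
#if defined __CUDACC__ || defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<blocks, threads>>>( __VA_ARGS__ )
#endif

// Example use, matching the calls above:
//   gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() );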
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
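The NB comment in the RandomNumberKernels.h hunk above notes that mgOnGpuConfig.h provides a definition of __global__ when MGONGPUCPP_GPUIMPL is not defined. A simplified sketch of that idea (the real header may use different guards; __device__ and __host__ are included here only for illustration) is:

// When neither nvcc nor hipcc drives the compilation, define the CUDA/HIP
// function attributes away so that kernel-style declarations parse as plain C++.
#ifndef MGONGPUCPP_GPUIMPL
#ifndef __global__
#define __global__
#endif
#ifndef __device__
#define __device__
#endif
#ifndef __host__
#define __host__
#endif
#endif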
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
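The fbridge.cc hunks below replace CudaRuntime::setUp/tearDown with GpuRuntime::setUp/tearDown, the same helper that check_sa.cc now times as "00 GpuInit". As the check_sa.cc comments explain, for CUDA the constructor invokes cudaSetDevice(0) and the destructor books a cudaDeviceReset() call. A hypothetical CUDA-only sketch of such an RAII helper (the PR's actual GpuRuntime.h presumably also covers HIP) is:

#ifdef __CUDACC__
#include <iostream>
// RAII helper: select the device on construction, reset it on destruction
// (the reset is what tools like cuda-memcheck --leak-check full rely on).
struct GpuRuntime final
{
  GpuRuntime( const bool debug = false ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  static void setUp( const bool debug = false )
  {
    if( debug ) std::cout << "GpuRuntime: calling cudaSetDevice(0)" << std::endl;
    cudaSetDevice( 0 ); // single-GPU assumption; multi-GPU support is tracked separately (#533)
  }
  static void tearDown( const bool debug = false )
  {
    if( debug ) std::cout << "GpuRuntime: calling cudaDeviceReset()" << std::endl;
    cudaDeviceReset();
  }
private:
  const bool m_debug;
};
#endif

In the Fortran-driven workflow the object's scope is not available, which is why fbridgecreate_ and fbridgedelete_ below call the two static methods explicitly.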
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. 
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
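The runTest.cc hunk above swaps checkCuda( cudaDeviceReset() ) for checkGpu( gpuDeviceReset() ). A minimal sketch of how such gpu* aliases and the checkGpu assertion could be wired up for the two backends (assertGpu and all alias names except gpuDeviceReset are assumptions here, not necessarily what GpuRuntime.h/GpuAbstraction.h define) is:

#if defined __CUDACC__ || defined __HIPCC__
#include <cassert>
#include <iostream>
#ifdef __CUDACC__
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuDeviceReset cudaDeviceReset
#else // __HIPCC__
#include "hip/hip_runtime.h"
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuDeviceReset hipDeviceReset
#endif
// Abort with a readable message if a GPU runtime call does not return gpuSuccess
inline void assertGpu( gpuError_t code, const char* file, int line )
{
  if( code != gpuSuccess )
  {
    std::cerr << "assertGpu: '" << gpuGetErrorString( code ) << "' at " << file << ":" << line << std::endl;
    assert( code == gpuSuccess );
  }
}
#define checkGpu( code ) assertGpu( code, __FILE__, __LINE__ )
#endif

With aliases of this kind the DeviceReset destructor above stays backend-neutral, and the same pattern covers the other cuda*/hip* runtime calls touched by this PR.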
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h index 55f43bb43a..add8fce575 100644 --- a/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 
2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc index a9bc93ff98..c5dd6e7e4c 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h index 932f123fea..5f2f4391b9 100644 --- a/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h index b247654dcf..475749ca7c 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
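
The mgOnGpuConfig.h hunks above introduce MGONGPUCPP_GPUIMPL as the single "is this a GPU build" switch: it is defined whenever the code is compiled with nvcc (__CUDACC__) or hipcc (__HIPCC__) and left undefined for CPU-only builds. As an illustrative sketch, not part of the patch, the snippet below shows the intended usage pattern; the backendName() helper is hypothetical and exists only for this example.

// Illustrative sketch (not part of the patch): select the namespace on the
// generic MGONGPUCPP_GPUIMPL macro, and only fall back to the compiler-specific
// macros where the distinction between CUDA and HIP actually matters.
#include "mgOnGpuConfig.h"

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // any GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // CPU build (scalar or SIMD C++)
#endif
{
  inline constexpr const char* backendName() // hypothetical helper
  {
#if defined __CUDACC__
    return "cuda"; // nvcc compilation
#elif defined __HIPCC__
    return "hip"; // hipcc compilation
#else
    return "cpp"; // plain C++ compilation
#endif
  }
}

Note that code paths which are genuinely CUDA-only, such as the thrust and cuComplex branches in mgOnGpuCxtypes.h below, deliberately keep the raw __CUDACC__ test instead of switching to MGONGPUCPP_GPUIMPL.
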
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ 
+#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt.sa/src/rambo.h b/epochX/cudacpp/gg_tt.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt.sa/src/rambo.h +++ b/epochX/cudacpp/gg_tt.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt.sa/src/read_slha.cc b/epochX/cudacpp/gg_tt.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_tt.sa/src/read_slha.cc +++ b/epochX/cudacpp/gg_tt.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt index b3d319e039..f5287cc1ca 100644 --- a/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt +++ b/epochX/cudacpp/gg_tt01g.mad/CODEGEN_mad_gg_tt01g_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005671977996826172  +DEBUG: model prefixing takes 0.005505561828613281  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -163,7 +163,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @2 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.020 s +1 processes with 16 diagrams generated in 0.019 s Total: 2 processes with 19 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_tt01g --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -185,7 +185,7 @@ INFO: Generating Helas calls for process: g g > t t~ WEIGHTED<=2 @1 INFO: Processing color information for process: g g > t t~ @1 INFO: Creating files in directory P2_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -202,7 +202,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P1_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -217,8 +217,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttx -Generated helas calls for 2 subprocesses (19 diagrams) in 0.044 s -Wrote files for 46 helas calls in 0.247 s +Generated helas calls for 2 subprocesses (19 diagrams) in 0.042 s +Wrote files for 46 helas calls in 0.243 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -233,7 +233,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.316 s +ALOHA: aloha creates 10 routines in 0.311 s VVV1 VVV1 FFV1 @@ -283,10 +283,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.334s -user 0m2.083s -sys 0m0.238s -Code generation completed in 2 seconds +real 0m2.369s +user 0m2.050s +sys 0m0.252s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -312,7 +312,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -342,7 +342,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_tt01g/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_tt01g.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
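
Both read_slha.cc and the Bridge constructor above drop std::filesystem in favour of plain POSIX calls to ease portability on LUMI (#803). The following is a self-contained sketch of that pattern, assuming only sys/stat.h and getenv; the function names below are illustrative and not taken from the patch.

// Illustrative sketch (not part of the patch): check whether a file exists with
// stat() and fall back to a directory taken from an environment variable, using
// plain string concatenation instead of std::filesystem::path (#803).
#include <sys/stat.h>
#include <cstdlib>
#include <string>

static bool fileExists( const std::string& name )
{
  struct stat buffer;
  return stat( name.c_str(), &buffer ) == 0; // 0 means the path exists
}

static std::string resolveCard( const std::string& name, const char* envpath )
{
  if( fileExists( name ) ) return name;
  const char* dir = std::getenv( envpath );
  return dir ? std::string( dir ) + "/" + name : name; // bypass std::filesystem
}

The same approach appears twice in the patch: Bridge.h wraps the stat() call in a local lambda to relocate the param_card, while read_slha.cc simply concatenates the environment directory and the file name.
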
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
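
The new GpuAbstraction.h above maps a single gpu* vocabulary onto either the cuda* or the hip* runtime API, so that memory management and kernel launches can be written once for both backends. A minimal usage sketch follows, under the assumption that the translation unit is compiled with nvcc or hipcc (so MGONGPUCPP_GPUIMPL is defined) and that GpuRuntime.h, shown next, provides checkGpu; the doubleAll kernel is hypothetical.

// Illustrative sketch (not part of the patch): a host/device round trip written
// only in terms of the gpu* macros from GpuAbstraction.h.
#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL (and pulls in hip_runtime.h on HIP)
#include "GpuRuntime.h"    // brings in GpuAbstraction.h and the checkGpu error check

__global__ void doubleAll( double* data, int n ) // hypothetical kernel
{
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if( i < n ) data[i] *= 2.;
}

void doubleOnDevice( double* hostData, int n )
{
  double* devData = nullptr;
  gpuMalloc( (void**)&devData, n * sizeof( double ) ); // cudaMalloc or hipMalloc
  gpuMemcpy( devData, hostData, n * sizeof( double ), gpuMemcpyHostToDevice );
  gpuLaunchKernel( doubleAll, ( n + 255 ) / 256, 256, devData, n ); // kernel<<<blocks,threads>>>(...)
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hostData, devData, n * sizeof( double ), gpuMemcpyDeviceToHost );
  gpuFree( devData );
}

Because every macro expands to the corresponding CUDA or HIP runtime call wrapped in checkGpu where appropriate, the same source builds unchanged for both vendors; only the compiler driver (nvcc vs hipcc) changes.
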
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. 
Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
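Note on the kernel-launch port above: MatrixElementKernels.cc replaces the CUDA-only triple-chevron launches with the portable gpuLaunchKernel / gpuLaunchKernelSharedMem wrappers, and checkCuda(cudaPeekAtLastError()) becomes checkGpu(gpuPeekAtLastError()). The wrapper definitions live in GpuAbstraction.h, which is not reproduced in this excerpt, so the following is only a sketch of the idea under that assumption; the only vendor calls it names (the <<<>>> syntax and hipLaunchKernelGGL) are the real CUDA/HIP launch mechanisms.

// Hypothetical sketch of the launch abstraction assumed by the calls above
#ifdef __CUDACC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, shmem, ... ) \
  kernel<<<( blocks ), ( threads ), ( shmem )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, shmem, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), ( shmem ), 0, __VA_ARGS__ )
#endif

With a layer like this in place, the .cc files contain no vendor-specific launch syntax, and gpuPeekAtLastError / gpuDeviceSynchronize presumably resolve to the corresponding cuda* or hip* runtime calls in the same header.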
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
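All of these MemoryAccess*.h headers apply the same mechanical substitution: the namespace and code-path selection that used to key on __CUDACC__ (defined only by nvcc) now keys on MGONGPUCPP_GPUIMPL, so the same sources can also take the GPU branch when built with hipcc. The macro itself is defined elsewhere (mgOnGpuConfig.h or the GPU abstraction headers would be the natural place, neither of which appears in this excerpt), so the snippet below is a minimal sketch of the intended behaviour, not the actual definition.

// Hypothetical sketch only: one macro covers both GPU toolchains,
// while plain C++ builds (no nvcc, no hipcc) leave it undefined.
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif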
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
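The MemoryAccessMomenta.h comment above states that, on GPUs, neppM is best set to a power of two times the number of fptype values in a 32-byte cacheline, to keep momenta reads coalesced in global memory. As a worked example (the concrete numbers below are illustrative arithmetic, not values taken from this diff):

// 32 bytes / sizeof(double) = 32 / 8 = 4 values per 32-byte cacheline,
// so candidate neppM values for double precision are 4, 8, 16, ... (4 * 2^k);
// for single precision the base becomes 32 / 4 = 8.
static_assert( 32 / sizeof( double ) == 4, "4 doubles per 32-byte cacheline" );
static_assert( 32 / sizeof( float ) == 8, "8 floats per 32-byte cacheline" );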
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
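In MemoryBuffers.h the pinned-host and device buffers now allocate through gpuMallocHost / gpuMalloc and release through gpuFreeHost / gpuFree, and the explicit checkCuda wrappers disappear, which suggests the error check is folded into the abstraction layer. GpuAbstraction.h is not shown here, so the mapping below is an assumption; the cuda* and hip* calls it names are the real vendor APIs (hipHostMalloc / hipHostFree are the HIP counterparts of cudaMallocHost / cudaFreeHost).

// Hypothetical sketch of the memory wrappers assumed by the buffer classes above
#ifdef __CUDACC__
#define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) ) // pinned host memory
#define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#elif defined __HIPCC__
#define gpuMallocHost( ptr, bytes ) checkGpu( hipHostMalloc( ptr, bytes ) ) // pinned host memory
#define gpuMalloc( ptr, bytes ) checkGpu( hipMalloc( ptr, bytes ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#endif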
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc index 18052b6676..dbaa56b35c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -573,6 +574,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! 
+ // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -586,6 +591,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -609,12 +616,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +642,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +768,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +794,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +814,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +828,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +861,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1071,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum 
over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h index 3ebd92c038..4a88a07226 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file 
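The host/device copy helpers and the constant-memory initialisation in CPPProcess.cc are ported along the same lines: cudaMemcpy becomes gpuMemcpy with gpuMemcpyHostToDevice / gpuMemcpyDeviceToHost kind flags, and cudaMemcpyToSymbol becomes gpuMemcpyToSymbol for cHel, cIPD, cNGoodHel and cGoodHel. Since GpuAbstraction.h only appears in this diff as a new symlink, the sketch below is again an assumption about its contents; cudaMemcpy(ToSymbol), hipMemcpy(ToSymbol) and the HIP_SYMBOL macro are the real underlying APIs.

// Hypothetical sketch of the copy wrappers assumed by the calls above
#ifdef __CUDACC__
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif

Nothing in the calling code then differs between the two backends; only this macro layer is toolkit-specific.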
diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P1_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc index bfab81142d..f80a0127b0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu 
MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -794,6 +795,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -807,6 +812,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -830,12 +837,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +863,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +989,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1015,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1035,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1049,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1082,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1292,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h index 3901ddcb20..d4b3c0445c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/P2_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
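
A pattern worth noting in these check_sa.cc hunks: the generic MGONGPUCPP_GPUIMPL guard marks "any GPU build" (CUDA or HIP), while the compiler-specific __CUDACC__ / __HIPCC__ guards are kept only where behaviour genuinely differs per vendor (for example, curand is available only on NVidia GPUs, hence the #error for __HIPCC__ above). A plausible sketch of how the generic flag could be derived, assuming it is set from the compiler macros in mgOnGpuConfig.h (the actual definition is not shown in this excerpt):

// Sketch only (assumed derivation): one switch meaning "this translation unit is built for a GPU"
#if defined __CUDACC__ || defined __HIPCC__
#ifndef MGONGPUCPP_GPUIMPL
#define MGONGPUCPP_GPUIMPL 1
#endif
#endif
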
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
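
The CUX/THR/CXS/STX workflow tags assembled below record which complex-number representation the build selected via the MGONGPU_CUCXTYPE_* / MGONGPU_HIPCXTYPE_* / MGONGPU_CPPCXTYPE_* macros. Purely as an illustration of that selection (type names and the use of double are assumptions; the real typedefs live elsewhere in the plugin sources and may differ):

// Hedged sketch: complex-number type per backend, following the MGONGPU_*CXTYPE_* macros used below.
namespace mgOnGpu { template<typename FP> class cxsmpl; } // in-house "simple complex" class, declared elsewhere
#if defined __CUDACC__ && defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype; // THR tag
#elif defined __CUDACC__ && defined MGONGPU_CUCXTYPE_CUCOMPLEX
#include <cuComplex.h>
typedef cuDoubleComplex cxtype; // CUX tag
#elif defined MGONGPU_CUCXTYPE_CXSMPL || defined MGONGPU_HIPCXTYPE_CXSMPL || defined MGONGPU_CPPCXTYPE_CXSMPL
typedef mgOnGpu::cxsmpl<double> cxtype; // CXS tag (e.g. on HIP, where thrust/cuComplex are unavailable)
#else
#include <complex>
typedef std::complex<double> cxtype; // STX tag
#endif
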
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! 
(AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) 
+export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += 
-DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring 
hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 
endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) 
$(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
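
fbridgecreate_ and fbridgedelete_ above now call GpuRuntime::setUp() and GpuRuntime::tearDown(), and check_sa.cc instantiates a GpuRuntime object at the start of main. The class itself comes from the new GpuRuntime.h, which is not reproduced in this diff; a rough sketch of the expected RAII shape, based on the comments in the check_sa.cc and runTest.cc hunks (member names and bodies are assumptions):

// Hedged sketch: RAII wrapper replacing the old CudaRuntime class (assumed shape, not the real header).
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  // On CUDA: select a device up front (cudaSetDevice( 0 ), per the check_sa.cc comment above)
  static void setUp( const bool /*debug*/ = true ) { /* cudaSetDevice( 0 ) or hipSetDevice( 0 ), wrapped in checkGpu */ }
  // On CUDA: reset the device at tear-down (cf. "needed by cuda-memcheck --leak-check full" in runTest.cc)
  static void tearDown( const bool /*debug*/ = true ) { /* checkGpu( gpuDeviceReset() ) */ }
  const bool m_debug;
};
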
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_tt01g.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_tt01g.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
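The host momenta buffers in the testxxx.cc hunk above use the AOSOA layout AOSOA[npagM][npar][np4][neppM], with nevt = npagM * neppM and neppM taken from MemoryAccessMomenta. Purely as an illustration (the real code always goes through the MemoryAccess* helpers rather than indexing by hand), the flat index of momentum component ip4 of particle ipar in event ievt would be computed as in this sketch:

#include <cstddef>

// Sketch of AOSOA flat indexing, assuming a plain contiguous fptype array underneath the buffer.
inline std::size_t aosoaIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                               std::size_t npar, std::size_t np4, std::size_t neppM )
{
  const std::size_t ipagM = ievt / neppM; // "page" grouping neppM events
  const std::size_t ieppM = ievt % neppM; // position of the event within its page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

With neppM = 1 this degenerates to a plain AOS layout, which is why the Bridge can memcpy Fortran momenta directly in that case instead of transposing them (see the gpu_sequence hunk further below).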
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_tt01g.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h index 80032e528b..6bde4466d0 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
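The hunks above and below replace most __CUDACC__ guards with MGONGPUCPP_GPUIMPL, which mgOnGpuConfig.h now defines for both CUDA (__CUDACC__) and HIP (__HIPCC__) builds and leaves undefined for plain C++ builds. A minimal sketch of the intended usage pattern, in a hypothetical source file that is not part of this patch:

#include "mgOnGpuConfig.h"

// Common code lands in the GPU or CPU namespace depending on the compiler in use.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // CUDA or HIP build
#else
namespace mg5amcCpu // plain C++ build
#endif
{
  inline const char* backendName()
  {
#if defined __CUDACC__
    return "cuda"; // CUDA-only choices must still test __CUDACC__, not MGONGPUCPP_GPUIMPL
#elif defined __HIPCC__
    return "hip";
#else
    return "cpp";
#endif
  }
}

This is why the complex-number selection in mgOnGpuCxtypes.h below keeps testing __CUDACC__ and __HIPCC__ explicitly rather than the common MGONGPUCPP_GPUIMPL flag.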
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see 
#318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/rambo.h +++ b/epochX/cudacpp/gg_tt01g.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_tt01g.mad/src/read_slha.cc b/epochX/cudacpp/gg_tt01g.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_tt01g.mad/src/read_slha.cc +++ b/epochX/cudacpp/gg_tt01g.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt index 37ba5c7297..ffc3d1d3ef 100644 --- a/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.mad/CODEGEN_mad_gg_ttg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005791187286376953  +DEBUG: model prefixing takes 0.005362510681152344  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=3: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g WEIGHTED<=3 @1 INFO: Process has 16 diagrams -1 processes with 16 diagrams generated in 0.022 s +1 processes with 16 diagrams generated in 0.021 s Total: 1 processes with 16 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g WEIGHTED<=3 @1 INFO: Processing color information for process: g g > t t~ g @1 INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,8 +190,8 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg -Generated helas calls for 1 subprocesses (16 diagrams) in 0.039 s -Wrote files for 36 helas calls in 0.153 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Wrote files for 36 helas calls in 0.149 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 10 routines in 0.315 s +ALOHA: aloha creates 10 routines in 0.308 s VVV1 VVV1 FFV1 @@ -252,10 +252,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.208s -user 0m1.988s -sys 0m0.221s -Code generation completed in 2 seconds +real 0m2.177s +user 0m1.931s +sys 0m0.233s +Code generation completed in 3 seconds ************************************************************ * * * W E L C O M E to * @@ -281,7 +281,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -311,7 +311,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
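The paramCard lookup above avoids std::filesystem (problematic with hipcc on LUMI, see #803) by probing the file with POSIX stat(). A self-contained sketch of the same fallback pattern, with a hypothetical helper name:

#include <string>
#include <sys/stat.h> // POSIX stat(), used instead of std::filesystem (see #803)

// Hypothetical helper mirroring the fileExists lambda in the Bridge constructor above:
// fall back to the parent directory if the card is not found at the given path.
inline std::string locateCard( std::string cardName )
{
  auto fileExists = []( const std::string& fileName )
  { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; };
  if( !fileExists( cardName ) ) cardName = "../" + cardName;
  return cardName;
}

For example, locateCard( "../../Cards/param_card.dat" ) returns "../../../Cards/param_card.dat" when the card is only visible one directory further up, matching the behaviour of the replaced std::filesystem code.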
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin.
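For illustration only (not part of the patch): a single source line written against the GpuAbstraction.h macros above resolves to the corresponding vendor API on each backend; cHel/tHel are the constant-memory helicity arrays copied later in this patch in CPPProcess.cc.

  gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );
  // nvcc  (__CUDACC__): checkGpu( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) )
  // hipcc (__HIPCC__) : checkGpu( hipMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) )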
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<<m_gpublocks, m_gputhreads>>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
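For illustration only (not part of the patch): given the definitions in GpuAbstraction.h, the launch macros used in MatrixElementKernels.cc above expand back to the triple-chevron syntax they replace, so the generated kernels run unchanged under nvcc or hipcc.

  gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
  // expands to: computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
  gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, /* ...kernel arguments... */ );
  // expands to: sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( /* ...kernel arguments... */ );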
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
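As a worked example of the neppM comment in MemoryAccessMomenta.h above (illustrative values only, not part of the patch): a 32-byte cacheline holds 32/sizeof(double) = 4 doubles or 32/sizeof(float) = 8 floats, so a coalesced AOSOA momenta layout on GPUs would use neppM = 4 (or a power-of-2 multiple of it) in double precision and neppM = 8 in single precision.

  constexpr int cachelineBytes = 32;
  constexpr int neppMDouble = cachelineBytes / sizeof( double ); // 4 doubles per 32-byte cacheline
  constexpr int neppMFloat = cachelineBytes / sizeof( float );   // 8 floats per 32-byte cacheline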
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
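A minimal standalone sketch (illustration only; ScopedDeviceArray is a hypothetical name, the real classes are the DeviceBufferBase/PinnedHostBufferBase templates above) of the RAII pattern that the gpuMalloc/gpuFree macros enable; it compiles only in GPU builds, where those macros are defined.

  template<typename T>
  struct ScopedDeviceArray
  {
    explicit ScopedDeviceArray( size_t n ) : m_size( n ) { gpuMalloc( &m_data, n * sizeof( T ) ); } // cudaMalloc or hipMalloc
    ~ScopedDeviceArray() { gpuFree( m_data ); }                                                     // cudaFree or hipFree
    ScopedDeviceArray( const ScopedDeviceArray& ) = delete;
    ScopedDeviceArray& operator=( const ScopedDeviceArray& ) = delete;
    T* data() { return m_data; }
    size_t bytes() const { return m_size * sizeof( T ); }
    T* m_data = nullptr;
    size_t m_size;
  };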
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index afeebde3c6..2fa9b4f651 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -794,6 +795,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -807,6 +812,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -830,12 +837,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +863,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +989,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1015,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1035,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1049,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1082,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by 
event color selection (#402) @@ -1282,7 +1292,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h 
new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
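In condensed form, the compiler selection that this FIXME describes, and that the hunks below implement option by option, behaves as in the following sketch. This is illustrative only and not part of the patch; it reuses the variable names introduced by the patch (GPUCC, CUDA_HOME, HIP_HOME) and omits the REQUIRE_CUDA/REQUIRE_HIP error branches and all flag settings:

# Illustrative sketch, not part of the patch: how cudacpp.mk picks the GPU compiler.
# Derive CUDA_HOME / HIP_HOME from nvcc / hipcc in $PATH if they are not already set,
# then prefer CUDA over HIP, and fall back to a CPU-only build if neither is found.
CUDA_HOME ?= $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null))
HIP_HOME  ?= $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null))
ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
  GPUCC = $(CUDA_HOME)/bin/nvcc    # option 1: CUDA build
else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
  GPUCC = $(HIP_HOME)/bin/hipcc    # option 2: HIP build
else
  override GPUCC =                 # option 3: no GPU build (C++ only)
endif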
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. 
Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py +++ 
b/epochX/cudacpp/gg_ttg.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h index 80032e528b..6bde4466d0 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
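The net effect of the new '-x cu' / '-x hip' flags above is that the very same .cc sources are compiled twice, once as plain C++ and once as GPU code, without renaming any files. Roughly, and purely as an illustration (not part of the patch), the generic $(BUILDDIR)/%_cu.o rule expands to:

#   CUDA build:  nvcc  $(CPPFLAGS) ... -Xcompiler -fPIC -c -x cu  Parameters_sm.cc -o Parameters_sm_cu.o
#   HIP build:   hipcc $(CPPFLAGS) ...            -fPIC -c -x hip Parameters_sm.cc -o Parameters_sm_cu.o
# Without '-x cu' / '-x hip' both compilers would treat the .cc input as host-only C++,
# __CUDACC__ / __HIPCC__ (and hence MGONGPUCPP_GPUIMPL) would not be defined, and no
# GPU object would actually be produced.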
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
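Aside (not part of the diff): mgOnGpuConfig.h above now defines MGONGPUCPP_GPUIMPL whenever either __CUDACC__ or __HIPCC__ is defined, and the bulk of this patch replaces the old "#ifdef __CUDACC__" tests with that macro. The recurring idiom, shown here as a minimal sketch, is:

    // Sketch of the pattern repeated throughout the patch
    // (assumes mgOnGpuConfig.h has been included first)
    #include "mgOnGpuConfig.h"
    #ifdef MGONGPUCPP_GPUIMPL
    namespace mg5amcGpu // CUDA or HIP build
    #else
    namespace mg5amcCpu // plain C++ build
    #endif
    {
      // types in here are defined differently for CPU and GPU builds (see #318 and #725)
    }

Tests that must remain CUDA-only (for example the thrust::complex vs cuComplex choices in mgOnGpuCxtypes.h below) keep checking __CUDACC__ directly rather than MGONGPUCPP_GPUIMPL.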
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.mad/src/rambo.h b/epochX/cudacpp/gg_ttg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.mad/src/read_slha.cc b/epochX/cudacpp/gg_ttg.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_ttg.mad/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttg.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt index adda711aad..f034db4427 100644 --- a/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt +++ b/epochX/cudacpp/gg_ttg.sa/CODEGEN_cudacpp_gg_ttg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005533933639526367  +DEBUG: model prefixing takes 0.00534820556640625  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/SubProcesses/P1_Sigma_sm_gg_ttxg/. -Generated helas calls for 1 subprocesses (16 diagrams) in 0.037 s +Generated helas calls for 1 subprocesses (16 diagrams) in 0.036 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 set of routines with options: P0 ALOHA: aloha creates VVVV3 set of routines with options: P0 ALOHA: aloha creates VVVV4 set of routines with options: P0 -ALOHA: aloha creates 5 routines in 0.328 s +ALOHA: aloha creates 5 routines in 0.325 s VVV1 VVV1 FFV1 @@ -203,7 +203,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttg/src/. 
quit -real 0m0.787s -user 0m0.730s -sys 0m0.049s -Code generation completed in 0 seconds +real 0m0.783s +user 0m0.718s +sys 0m0.051s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( 
m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... 
this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
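Aside (not part of the diff): Bridge.h above (and read_slha.cc before it) drops std::filesystem in favour of a plain POSIX existence check, because some ROCm/hipcc setups still need <experimental/filesystem> (see the ROCm changelog link and issue #803 referenced in the patch). A self-contained sketch of the same check, written as a free function instead of the in-place lambda used in the patch:

    #include <sys/stat.h>
    #include <string>
    // Return true if the file can be stat'ed, i.e. it exists and is accessible
    inline bool fileExists( const std::string& fileName )
    {
      struct stat buffer;
      return stat( fileName.c_str(), &buffer ) == 0;
    }
    // Usage as in Bridge.h: if( !fileExists( paramCard ) ) paramCard = "../" + paramCard;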
#ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] 
- // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! 
+ */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
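Aside (not part of the diff): the kernel launches in MatrixElementKernels.cc above now go through the gpuLaunchKernel / gpuLaunchKernelSharedMem wrappers from the new GpuAbstraction.h, which wrap the standard triple-chevron launch syntax accepted by both nvcc and hipcc. A sketch of the mapping, using a call that appears in the patch:

    // Sketch of the launch abstraction (chevron form as in standard CUDA/HIP syntax)
    #define gpuLaunchKernel( kernel, blocks, threads, ... ) \
      kernel<<<blocks, threads>>>( __VA_ARGS__ )
    #define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
      kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )

    // so that, for example,
    //   gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
    // expands to the former CUDA-only form
    //   computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );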
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
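The bulk of the header changes above and below are the mechanical replacement of #ifdef __CUDACC__ by #ifdef MGONGPUCPP_GPUIMPL. The new macro is defined in mgOnGpuConfig.h, which is not shown in this part of the patch; the working assumption illustrated below is that it is set for any GPU build, CUDA or HIP, while __CUDACC__ and __HIPCC__ remain available where genuinely vendor-specific branches are needed (curand, complex types, NVTX).

// Assumed convention (sketch, not the literal mgOnGpuConfig.h code)
#if defined __CUDACC__
#define MGONGPUCPP_GPUIMPL 1 // GPU implementation compiled with nvcc (CUDA)
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // GPU implementation compiled with hipcc (HIP)
#endif
// Typical usage pattern, repeated throughout the patch:
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // any GPU backend (CUDA or HIP)
#else
namespace mg5amcCpu // C++/SIMD host backend
#endif
{
  // ... the same source is compiled into the appropriate namespace ...
}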
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
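The neppM comment retained in MemoryAccessMomenta.h above refers to the AOSOA ("array of structs of arrays") layout of the momenta buffer: events are grouped into pages of neppM events so that, on a GPU, consecutive threads read consecutive fptype values and the global-memory accesses coalesce. The accessor below is only a simplified illustration of that indexing idea (the name and signature are hypothetical; the real accessors in MemoryAccessMomenta.h are templated and more general):

// Simplified sketch of AOSOA indexing: buffer layout [npagM][npar][np4][neppM],
// with npar external particles, np4 = 4 momentum components and neppM events per page.
// 'fptype' stands for the codebase's floating-point typedef (double or float).
inline const fptype& momentumAOSOA( const fptype* buffer, const int ievt, const int ipar, const int ip4, const int npar, const int np4, const int neppM )
{
  const int ipagM = ievt / neppM; // page that contains this event
  const int ieppM = ievt % neppM; // position of this event inside its page
  return buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];
}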
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
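In the MemoryBuffers.h changes above, the pinned-host and device buffer classes now allocate and free through gpuMallocHost / gpuFreeHost and gpuMalloc / gpuFree, and the copyDeviceFromHost / copyHostFromDevice helpers further below go through gpuMemcpy. As before, the real definitions are in the new GpuRuntime.h / GpuAbstraction.h headers; the mapping below is only a plausible sketch of how they could forward to the two vendor APIs, with the error check folded into the wrapper (which is why the explicit checkCuda calls disappear from the call sites):

// Plausible sketch only (assumed mapping, not the literal header content)
#if defined __CUDACC__ // CUDA build
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) // pinned host memory
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )         // device memory
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#elif defined __HIPCC__ // HIP build
#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // (hipMallocHost is deprecated)
#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) )
#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) )
#define gpuFree( ptr ) checkGpu( hipFree( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( symbol, src, bytes ) )
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#endif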
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc index 2988a13b82..661197ace8 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( 
int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -499,7 +500,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -556,7 +557,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -615,7 +616,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -678,8 +679,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -720,9 +721,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -759,7 +760,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -788,6 +789,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -801,6 +806,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -824,12 +831,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -850,7 +857,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -976,9 +983,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1002,7 +1009,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1022,7 +1029,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1036,9 +1043,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1066,7 +1076,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by 
event color selection (#402) @@ -1276,7 +1286,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git 
a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/P1_Sigma_sm_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
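For context before the fbridge.cc hunks that follow: the new GpuRuntime.h header replaces CudaRuntime.h so that the same setUp()/tearDown() calls work for both the CUDA and the HIP backend. Below is a minimal standalone sketch of such a helper; it is not the plugin's actual GpuRuntime.h, and the gpuFree/gpuDeviceReset/checkGpu mapping is an assumption based on the checkGpu( gpuDeviceReset() ) call visible in the runTest.cc hunk further down.

  // Minimal sketch (assumed names, not the plugin's actual GpuRuntime.h)
  #include <stdexcept>
  #include <string>
  #if defined __CUDACC__
  #include <cuda_runtime.h>
  #define gpuError_t cudaError_t
  #define gpuSuccess cudaSuccess
  #define gpuGetErrorString cudaGetErrorString
  #define gpuFree cudaFree
  #define gpuDeviceReset cudaDeviceReset
  #elif defined __HIPCC__
  #include <hip/hip_runtime.h>
  #define gpuError_t hipError_t
  #define gpuSuccess hipSuccess
  #define gpuGetErrorString hipGetErrorString
  #define gpuFree hipFree
  #define gpuDeviceReset hipDeviceReset
  #endif
  #if defined __CUDACC__ || defined __HIPCC__
  namespace mg5amcGpu
  {
    // Throw on any non-success return code from the CUDA or HIP runtime
    inline void checkGpu( gpuError_t code )
    {
      if( code != gpuSuccess ) throw std::runtime_error( std::string( "GPU runtime error: " ) + gpuGetErrorString( code ) );
    }
    struct GpuRuntime
    {
      static void setUp() { checkGpu( gpuFree( 0 ) ); }       // freeing a null pointer is a no-op that forces context initialisation
      static void tearDown() { checkGpu( gpuDeviceReset() ); } // reset the device, e.g. for cuda-memcheck/compute-sanitizer leak checks
    };
  }
  #endif

With a mapping of this kind in place, fbridgecreate_ and fbridgedelete_ below only need the single MGONGPUCPP_GPUIMPL guard instead of separate CUDA and HIP branches.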
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. 
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
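All of the __CUDACC__ to MGONGPUCPP_GPUIMPL replacements in these test sources rely on one switch that this patch adds to mgOnGpuConfig.h (see the hunk further down); condensed into a short sketch, the compile-time logic is:

  // Condensed from the mgOnGpuConfig.h hunk below: MGONGPUCPP_GPUIMPL is set
  // for any GPU build (nvcc or hipcc), so one #ifdef covers both backends.
  #ifdef __CUDACC__
  #define MGONGPUCPP_GPUIMPL cuda
  #elif defined __HIPCC__
  #define MGONGPUCPP_GPUIMPL hip
  #include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx
  #else
  #undef MGONGPUCPP_GPUIMPL // CPU-only build
  #endif

  // Typical use in the sources touched by this patch: pick the GPU or CPU namespace
  #ifdef MGONGPUCPP_GPUIMPL
  namespace mg5amcGpu { /* CUDA or HIP implementation */ }
  #else
  namespace mg5amcCpu { /* C++/SIMD implementation */ }
  #endif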
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h index 361b488401..0dd0f3ebba 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // 
Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h index b247654dcf..475749ca7c 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
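Before the mgOnGpuCxtypes.h hunks that follow, it may help to see the complex-type selection they implement gathered in one place. This is a simplified sketch only (the cuComplex branch and the float/double switch are left out, and the exact typedef spellings are assumptions; fptype and mgOnGpu::cxsmpl come from the plugin headers): CUDA defaults to thrust::complex, HIP can currently only use the plugin's own cxsmpl (#810), and C++ uses std::complex or cxsmpl.

  // Simplified sketch of the cxtype selection in mgOnGpuCxtypes.h (assumed spellings)
  #if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST
  #include <thrust/complex.h>
  typedef thrust::complex<fptype> cxtype;  // CUDA default
  #elif defined __HIPCC__
  typedef mgOnGpu::cxsmpl<fptype> cxtype;  // HIP: cxsmpl is currently the only option (#810)
  #elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
  #include <complex>
  typedef std::complex<fptype> cxtype;     // C++ alternative
  #else
  typedef mgOnGpu::cxsmpl<fptype> cxtype;  // C++ default (and CUDA with MGONGPU_CUCXTYPE_CXSMPL)
  #endif

The sanity checks added in the mgOnGpuConfig.h hunk above enforce that exactly one of these options is chosen per backend.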
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef 
__CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttg.sa/src/rambo.h b/epochX/cudacpp/gg_ttg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttg.sa/src/read_slha.cc b/epochX/cudacpp/gg_ttg.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_ttg.sa/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttg.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt index 2c2fae1608..0da89f1729 100644 --- a/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.mad/CODEGEN_mad_gg_ttgg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0057299137115478516  +DEBUG: model prefixing takes 0.005736112594604492  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.163 s +1 processes with 123 diagrams generated in 0.160 s Total: 1 processes with 123 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttgg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Generating Helas calls for process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Processing color information for process: g g > t t~ g g @1 INFO: Creating files in directory P1_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -190,15 +190,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg -Generated helas calls for 1 subprocesses (123 diagrams) in 0.433 s -Wrote files for 222 helas calls in 0.711 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.423 s +Wrote files for 222 helas calls in 0.683 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.336 s +ALOHA: aloha creates 5 routines in 0.328 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -206,7 +206,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.327 s +ALOHA: aloha creates 10 routines in 0.310 s VVV1 VVV1 FFV1 @@ -255,9 +255,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m3.329s -user 0m3.091s -sys 0m0.226s +real 0m3.254s +user 0m3.011s +sys 0m0.236s Code generation completed in 4 seconds ************************************************************ * * @@ -284,7 +284,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -314,7 +314,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttgg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
- -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! 
- } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
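The CudaRuntime.h file deleted above (and its GpuRuntime.h replacement introduced below) is built around one RAII object instantiated at the start of main(): the constructor calls cudaSetDevice(0) so the one-off CUDA initialization cost is book-kept early, and the destructor calls cudaDeviceReset() to help leak-checking tools. A plain-CUDA sketch of that pattern, using illustrative names (CudaRuntimeGuard, CHECK_CUDA) rather than the plugin's classes and checkGpu macros, could look like this:

#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>

#define CHECK_CUDA( code ) { assertCudaOk( code, __FILE__, __LINE__ ); }
inline void assertCudaOk( cudaError_t code, const char* file, int line )
{
  if( code != cudaSuccess )
  {
    printf( "ERROR: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
    assert( code == cudaSuccess );
  }
}

struct CudaRuntimeGuard
{
  CudaRuntimeGuard() { CHECK_CUDA( cudaSetDevice( 0 ) ); } // book-keep the init cost up front
  ~CudaRuntimeGuard() { cudaDeviceReset(); }               // mainly useful for leak-checking tools
  CudaRuntimeGuard( const CudaRuntimeGuard& ) = delete;
  CudaRuntimeGuard& operator=( const CudaRuntimeGuard& ) = delete;
};

int main()
{
  CudaRuntimeGuard guard; // one instance at the beginning of main, as the plugin headers recommend
  // ... allocate buffers, launch kernels, etc.
  return 0;
}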
#ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! 
+ */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
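The MatrixElementKernels.cc hunks above replace the CUDA-only kernel<<<blocks, threads, sharedMem>>>(...) launches with the variadic gpuLaunchKernel / gpuLaunchKernelSharedMem macros from the new GpuAbstraction.h, which expand back to the triple-chevron syntax for both nvcc and hipcc. A self-contained CUDA sketch of the same mechanism with a trivial kernel (scaleArray, sizes and data are illustrative):

#include <cuda_runtime.h>
#include <cstdio>

// Same CUDA-branch definition as in GpuAbstraction.h
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )

__global__ void scaleArray( float* data, const float factor, const int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x; // one array element per thread
  if( i < n ) data[i] *= factor;
}

int main()
{
  const int n = 1024;
  float* d_data = nullptr;
  cudaMalloc( reinterpret_cast<void**>( &d_data ), n * sizeof( float ) );
  cudaMemset( d_data, 0, n * sizeof( float ) );
  gpuLaunchKernel( scaleArray, 4, 256, d_data, 2.0f, n ); // call site no longer spells out <<<...>>>
  cudaDeviceSynchronize();
  printf( "launch status: %s\n", cudaGetErrorString( cudaGetLastError() ) );
  cudaFree( d_data );
  return 0;
}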
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
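Nearly every header in this patch swaps #ifdef __CUDACC__ for #ifdef MGONGPUCPP_GPUIMPL around the namespace choice: the same source compiles into mg5amcGpu for GPU builds and mg5amcCpu for CPU builds, so both flavours can be linked into one executable without clashing symbols (see #318 and #725, and the MadgraphTest.h comment above about multiply defined symbols). A small sketch of the idiom; MGONGPUCPP_GPUIMPL is assumed to be set by the build configuration (presumably in mgOnGpuConfig.h), and backendName is an illustrative function:

#include <iostream>

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backendName()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "GPU (CUDA or HIP)";
#else
    return "CPU (C++/SIMD)";
#endif
  }
}

int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  using namespace mg5amcGpu;
#else
  using namespace mg5amcCpu;
#endif
  std::cout << "Compiled for backend: " << backendName() << std::endl;
  return 0;
}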
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
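The MemoryAccessMomenta.h comment above ties neppM, the number of events per page in the momenta AOSOA buffer, to the 32-byte cacheline so that neighbouring GPU threads read coalesced fptype values; the Bridge.h transposition code earlier documents the layout as momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM. A hedged sketch of the flat index that layout implies (the constants are illustrative and this is not the MemoryAccessMomenta implementation):

#include <cstddef>
#include <iostream>

constexpr int npar = 6;  // external particles, e.g. g g > t t~ g g (illustrative)
constexpr int np4 = 4;   // E, px, py, pz
constexpr int neppM = 8; // events per page (illustrative; the header above ties this to the 32-byte cacheline)

// Flat offset of component ip4 of particle ipar for event ievt in an AOSOA buffer
inline std::size_t aosoaIndex( int ievt, int ipar, int ip4 )
{
  const int ipagM = ievt / neppM; // page holding this event
  const int ieppM = ievt % neppM; // position of this event inside the page
  return ( ( static_cast<std::size_t>( ipagM ) * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}

int main()
{
  std::cout << "offset of (ievt=10, ipar=2, ip4=3): " << aosoaIndex( 10, 2, 3 ) << std::endl;
  return 0;
}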
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
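In the MemoryBuffers.h hunks above, every allocation is wrapped in a small RAII class: PinnedHostBufferBase and DeviceBufferBase now call gpuMallocHost/gpuMalloc in their constructors and gpuFreeHost/gpuFree in their destructors, and the per-quantity typedefs (HostBufferMomenta, HostBufferGs, ...) only fix the element type and size. The same idea written directly against the CUDA runtime API, as an illustrative stand-in for the plugin's DeviceBufferBase rather than its actual code:

#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>

template <typename T>
class DeviceArray
{
public:
  explicit DeviceArray( std::size_t size )
    : m_size( size ), m_data( nullptr )
  {
    if( cudaMalloc( reinterpret_cast<void**>( &m_data ), bytes() ) != cudaSuccess )
      throw std::runtime_error( "cudaMalloc failed" );
  }
  ~DeviceArray() { cudaFree( m_data ); } // freed automatically when the buffer goes out of scope
  DeviceArray( const DeviceArray& ) = delete;
  DeviceArray& operator=( const DeviceArray& ) = delete;
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
  std::size_t bytes() const { return m_size * sizeof( T ); }
private:
  std::size_t m_size;
  T* m_data;
};

int main()
{
  DeviceArray<double> devMEs( 1024 ); // e.g. one matrix element per event (illustrative size)
  cudaMemset( devMEs.data(), 0, devMEs.bytes() );
  return 0;
}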
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc index 19bc1e7973..0bb184f0e6 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity 
< nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2417,7 +2418,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2475,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2534,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,8 +2629,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,9 +2672,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2711,7 +2712,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2740,6 +2741,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -2753,6 +2758,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -2776,12 +2783,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2809,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2928,9 +2935,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -2954,7 +2961,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2974,7 +2981,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2988,9 +2995,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3018,7 +3028,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for 
event by event color selection (#402) @@ -3228,7 +3238,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h index 04f7c62976..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h 
b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
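The hunks above replace direct CUDA runtime calls, checkCuda( cudaMemcpy( ... ) ) and checkCuda( cudaMemcpyToSymbol( ... ) ), with gpuMemcpy and gpuMemcpyToSymbol, and a GpuAbstraction.h symlink is added next to the process sources. The header itself is not shown in this patch; the lines below are only a minimal sketch of the kind of mapping it would provide for nvcc and hipcc builds, and checkGpu is a hypothetical error-check wrapper, not a name taken from the plugin.

// Minimal sketch only - not the actual GpuAbstraction.h shipped with the plugin.
// One set of gpu* names maps onto either the CUDA or the HIP runtime API.
#include <cassert>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define checkGpu( code ) assert( ( code ) == cudaSuccess ) // hypothetical check (a real wrapper would not rely on assert)
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define checkGpu( code ) assert( ( code ) == hipSuccess ) // hypothetical check
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( hipMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( sym ), src, bytes ) )
#endif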
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
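In the check_sa.cc hunk above, the "00 CudaInit" step becomes "00 GpuInit" and a GpuRuntime object replaces the old CudaRuntime; later in the patch, fbridge.cc drives the same class through its static setUp()/tearDown() calls. GpuRuntime.h enters this process directory only as a symlink, so its contents are not visible here; the sketch below shows the kind of RAII helper the usage implies, with only the CUDA branch spelled out, error checking omitted, and member names and printouts assumed.

// Hypothetical sketch of a GpuRuntime helper in the spirit of the old CudaRuntime class.
#include <iostream>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#endif
struct GpuRuntime final
{
  GpuRuntime( const bool debug = false ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  GpuRuntime( const GpuRuntime& ) = delete;
  static void setUp( const bool debug = false )
  {
#ifdef __CUDACC__
    cudaSetDevice( 0 ); // bind the process to the first visible GPU
#endif
    if( debug ) std::cout << "GpuRuntime::setUp" << std::endl;
  }
  static void tearDown( const bool debug = false )
  {
#ifdef __CUDACC__
    cudaDeviceReset(); // flush device state (and any profile data) before the process exits
#endif
    if( debug ) std::cout << "GpuRuntime::tearDown" << std::endl;
  }
  const bool m_debug;
};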
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
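The updated comment in RandomNumberKernels.h above ("our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined") summarises the overall scheme of this patch: a single MGONGPUCPP_GPUIMPL switch, set for both nvcc and hipcc builds, replaces the old __CUDACC__ tests, while mgOnGpuConfig.h gives pure C++ builds no-op definitions of the GPU keywords. A rough sketch of what that implies follows; the actual mgOnGpuConfig.h content is not part of this patch and may differ.

// Rough sketch only - not the actual mgOnGpuConfig.h content.
// One "GPU implementation" switch for both CUDA and HIP builds...
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif
// ...and no-op GPU keywords for C++-only builds, so that the same sources
// also compile with a plain host compiler.
#ifndef MGONGPUCPP_GPUIMPL
#define __global__
#define __device__
#define __constant__
#endif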
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. 
Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
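// Illustrative sketch (not from the patch itself): the fbridgecreate_/fbridgedelete_ hunks above now call
// GpuRuntime::setUp()/GpuRuntime::tearDown() where the old code called CudaRuntime::setUp()/tearDown(), and the
// runTest.cc fixture calls checkGpu( gpuDeviceReset() ). The new GpuRuntime.h is not reproduced in this section,
// so the wrapper below, and the gpuSetDevice/gpuDeviceReset names, are only an assumption about its likely shape;
// error checking via checkGpu is omitted here.
#if defined __CUDACC__ || defined __HIPCC__
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuSetDevice cudaSetDevice
#define gpuDeviceReset cudaDeviceReset
#else
#include "hip/hip_runtime.h"
#define gpuSetDevice hipSetDevice
#define gpuDeviceReset hipDeviceReset
#endif
struct GpuRuntime final
{
  // Calling gpuSetDevice(0) up front book-keeps the (slow) GPU initialisation explicitly,
  // as the old CudaRuntime::setUp did with cudaSetDevice(0)
  static void setUp() { gpuSetDevice( 0 ); }
  // Resetting the device is only needed for leak checking (cuda-memcheck / compute-sanitizer),
  // which is why runTest.cc invokes it from a fixture destructor
  static void tearDown() { gpuDeviceReset(); }
};
#endif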
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py +++ 
b/epochX/cudacpp/gg_ttgg.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h index 80032e528b..6bde4466d0 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
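// Illustrative sketch (not from the patch itself): a condensed view of the dispatch introduced in mgOnGpuConfig.h
// above. __CUDACC__ or __HIPCC__ defines MGONGPUCPP_GPUIMPL, which selects the mg5amcGpu namespace everywhere,
// while the complex type still depends on the specific GPU compiler (thrust is CUDA-only, cxsmpl is so far the
// only HIP option, #810). fptype and cxsmpl are the real types from the mgOnGpu headers; they are stubbed here
// only to keep the sketch self-contained, and the defaults shown are the ones quoted in the hunks above.
#include <complex>
#ifdef __CUDACC__
#include <thrust/complex.h>
#endif
typedef double fptype;              // stub: really chosen via MGONGPU_FPTYPE_DOUBLE / MGONGPU_FPTYPE_FLOAT
template<typename FP> class cxsmpl; // stub: really defined in mgOnGpuCxtypes.h

#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#endif

#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
#ifdef __CUDACC__
  typedef thrust::complex<fptype> cxtype; // CUDA default (MGONGPU_CUCXTYPE_THRUST)
#elif defined __HIPCC__
  typedef cxsmpl<fptype> cxtype;          // only option on HIP so far (#810)
#else
  typedef cxsmpl<fptype> cxtype;          // C++ default (MGONGPU_CPPCXTYPE_CXSMPL); std::complex is the alternative
#endif
}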
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and 
#725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.mad/src/read_slha.cc b/epochX/cudacpp/gg_ttgg.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_ttgg.mad/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttgg.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt index 3c3686e228..9ebee16fdf 100644 --- a/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt +++ b/epochX/cudacpp/gg_ttgg.sa/CODEGEN_cudacpp_gg_ttgg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005596637725830078  +DEBUG: model prefixing takes 0.005690574645996094  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=4: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g WEIGHTED<=4 @1 INFO: Process has 123 diagrams -1 processes with 123 diagrams generated in 0.166 s +1 processes with 123 diagrams generated in 0.157 s Total: 1 processes with 123 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttgg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/SubProcesses/P1_Sigma_sm_gg_ttxgg/. 
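// Illustrative sketch (not from the patch itself): the read_slha.cc hunk above, like the Bridge.h hunk further
// below, bypasses std::filesystem to ease portability on LUMI (#803) by composing paths with plain string
// concatenation and checking file existence with POSIX stat(). The fileExists name matches the lambda added in
// Bridge.h; the cardInEnvDir helper is purely illustrative.
#include <cstdlib>
#include <string>
#include <sys/stat.h>

// Existence check without std::filesystem::exists
inline bool fileExists( const std::string& fileName )
{
  struct stat buffer;
  return stat( fileName.c_str(), &buffer ) == 0;
}

// Path composition without std::filesystem::path (assumes envpath is set, as read_slha.cc does)
inline std::string cardInEnvDir( const char* envpath, const std::string& fileName )
{
  return std::string( getenv( envpath ) ) + "/" + fileName;
}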
-Generated helas calls for 1 subprocesses (123 diagrams) in 0.442 s +Generated helas calls for 1 subprocesses (123 diagrams) in 0.432 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.337 s +ALOHA: aloha creates 5 routines in 0.323 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttgg/src/. quit -real 0m1.506s -user 0m1.438s -sys 0m0.059s +real 0m1.851s +user 0m1.398s +sys 0m0.060s Code generation completed in 2 seconds diff --git a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttgg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
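// Illustrative sketch (not from the patch itself): the gpu_sequence hunk below uses gpuMemcpy(...) and
// gpuLaunchKernel( kernel, blocks, threads, args... ) in place of cudaMemcpy and the <<<...>>> launch syntax.
// The new GpuAbstraction.h is not reproduced in this section, so the definitions here are only one plausible
// way such wrappers could be written (error checking omitted).
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuMemcpy cudaMemcpy
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuMemcpy hipMemcpy
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif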
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
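Illustration only (not part of this patch): the new GpuAbstraction.h maps one gpu* spelling onto either the CUDA or the HIP runtime API at compile time. A minimal usage sketch, assuming a GPU build where MGONGPUCPP_GPUIMPL is defined, GpuAbstraction.h and the checkGpu helper from GpuRuntime.h are included, the grid covers exactly nevt events, and 'myKernel', 'sketch', 'gpublocks', 'gputhreads' are hypothetical placeholder names:
  __global__ void myKernel( double* buf ) { buf[blockDim.x * blockIdx.x + threadIdx.x] = 0.; }
  void sketch( const double* hstBuf, const int nevt, const int gpublocks, const int gputhreads )
  {
    double* devBuf = nullptr;
    gpuMalloc( (void**)&devBuf, nevt * sizeof( double ) );                       // cudaMalloc or hipMalloc, error-checked via checkGpu
    gpuMemcpy( devBuf, hstBuf, nevt * sizeof( double ), gpuMemcpyHostToDevice ); // cudaMemcpy or hipMemcpy
    gpuLaunchKernel( myKernel, gpublocks, gputhreads, devBuf );                  // expands to myKernel<<<gpublocks, gputhreads>>>( devBuf )
    checkGpu( gpuDeviceSynchronize() );                                          // wait for the kernel and surface any error
    gpuFree( devBuf );                                                           // cudaFree or hipFree
  }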
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginning of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW!
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
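For context (a sketch of intent, not code from the patch): on the CUDA backend the gpuLaunchKernelSharedMem call in the hunk above is expected to expand to the usual triple-chevron launch, and the two checkGpu calls that follow catch different error classes. Assuming a hypothetical kernel 'k' and its launch parameters:
  k<<<blocks, threads, sharedMemSize>>>( args );  // what gpuLaunchKernelSharedMem( k, blocks, threads, sharedMemSize, args ) should produce under CUDA
  checkGpu( gpuPeekAtLastError() );               // synchronous launch errors (e.g. an invalid configuration) are reported immediately
  checkGpu( gpuDeviceSynchronize() );             // blocks until the kernel completes and reports asynchronous execution errors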
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
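The KernelAccessHelper hunk above shows the pattern that most of these #ifdef swaps protect: in a GPU build each thread computes its own event index from the grid coordinates, while the C++ build iterates over events explicitly. A minimal sketch of that split (illustrative only; 'processEvent', 'buffer' and 'nevt' are placeholder names):
  #ifdef MGONGPUCPP_GPUIMPL
    const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // one GPU thread per event
    processEvent( buffer, ievt );
  #else
    for( int ievt = 0; ievt < nevt; ++ievt ) // CPU: explicit event loop (SIMD event pages in the real code)
      processEvent( buffer, ievt );
  #endif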
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
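To make the coalescing remark in the MemoryAccessMomenta.h hunk above concrete: momenta are stored in an AOSOA layout with neppM events per page, so that consecutive GPU threads read neighbouring fptype's from global memory. A hedged indexing sketch (the real accessors live in MemoryAccessMomenta.h; the names here are illustrative):
  // AOSOA[ipagM][ipar][ip4][ieppM]: page, particle, 4-momentum component, event-within-page
  const int ipagM = ievt / neppM; // memory page for this event
  const int ieppM = ievt % neppM; // event position within the page
  const fptype& p = buffer[( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM];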
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
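The buffer classes in the hunks above are thin RAII wrappers around the new gpu* macros: allocation in the constructor, release in the destructor, with checkGpu applied inside the macros themselves. A stripped-down sketch of the device-buffer idea (illustration only; template arguments are elided in the flattened diff above, and copy control is omitted here):
  template<typename T>
  struct DeviceBufferSketch
  {
    explicit DeviceBufferSketch( const size_t size ) : m_size( size ) { gpuMalloc( (void**)&m_data, size * sizeof( T ) ); } // cudaMalloc or hipMalloc
    ~DeviceBufferSketch() { gpuFree( m_data ); }                                                                            // cudaFree or hipFree
    T* m_data = nullptr;
    size_t m_size;
  };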
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc index f9016eaa88..f2a85b9b75 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu 
#endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2474,7 +2475,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2531,7 +2532,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2590,7 +2591,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2685,8 +2686,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2728,9 +2729,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2768,7 +2769,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2797,6 +2798,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -2810,6 +2815,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -2833,12 +2840,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2859,7 +2866,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2985,9 +2992,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -3011,7 +3018,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -3031,7 +3038,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -3045,9 +3052,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3075,7 +3085,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF 
PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3285,7 +3295,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h index 04f7c62976..deb1358992 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuAbstraction.h 
@@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/P1_Sigma_sm_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
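As the check_sa.cc hunks below show, the CudaRuntime helper is replaced by the backend-neutral GpuRuntime, still instantiated once at the top of main() so that device setup and teardown are tied to the object's lifetime. Schematically (a sketch, not the full program):
  #include "GpuRuntime.h"
  int main( int argc, char** argv )
  {
  #ifdef MGONGPUCPP_GPUIMPL
    mg5amcGpu::GpuRuntime gpuRuntime( /*debug=*/true ); // constructor calls gpuSetDevice(0), destructor calls gpuDeviceReset()
  #endif
    // ... event generation and matrix element workflow ...
    return 0;
  }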
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; #endif @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for nvcc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to nvcc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775).
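[Editorial aside, not part of the patch] The FIXME block above documents how cudacpp.mk selects the single GPU compiler variable GPUCC: nvcc is searched for under CUDA_HOME, hipcc under HIP_HOME, and either backend can be disabled by pointing the corresponding variable at an invalid path. A minimal usage sketch of that selection logic, assuming for illustration only that the makefile is invoked directly with make -f and that /usr/local/cuda and /opt/rocm are placeholder installation paths:
  # CUDA build: nvcc is found under CUDA_HOME (or via PATH if CUDA_HOME is unset)
  CUDA_HOME=/usr/local/cuda make -f cudacpp.mk
  # HIP build on a node that also has CUDA: make CUDA_HOME invalid so the HIP branch is taken
  CUDA_HOME=/disabled HIP_HOME=/opt/rocm make -f cudacpp.mk
  # C++-only build: make both invalid (GPUCC stays empty and the build falls back to common random numbers)
  CUDA_HOME=/disabled HIP_HOME=/disabled make -f cudacpp.mk
The behaviour sketched here follows the Option 1/2/3 branches introduced further down in this makefile; the exact invocation of cudacpp.mk in the full build system may differ.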
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. 
Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttgg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ 
// Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttgg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h index b247654dcf..475749ca7c 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
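// [Editor's note] A minimal standalone sketch (not part of the patch) of the backend-selection
// pattern introduced in the mgOnGpuConfig.h hunk above: MGONGPUCPP_GPUIMPL is defined whenever a
// GPU compiler front end is active (nvcc's __CUDACC__ or hipcc's __HIPCC__) and left undefined in
// plain C++ builds, so a single "#ifdef MGONGPUCPP_GPUIMPL" replaces the old CUDA-only checks.
// The backendName() helper is an illustration only; build with g++, nvcc or hipcc.
#include <iostream>
#ifdef __CUDACC__
#define MGONGPUCPP_GPUIMPL cuda
#elif defined __HIPCC__
#define MGONGPUCPP_GPUIMPL hip
#endif
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  inline const char* backendName()
  {
#ifdef MGONGPUCPP_GPUIMPL
    return "GPU build (CUDA or HIP)";
#else
    return "CPU build (plain C++)";
#endif
  }
}
int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  std::cout << mg5amcGpu::backendName() << std::endl;
#else
  std::cout << mg5amcCpu::backendName() << std::endl;
#endif
  return 0;
}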
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) 
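// [Editor's note] A minimal standalone sketch (not part of the patch) of the per-backend complex
// type selection described in the mgOnGpuCxtypes.h hunks above: thrust::complex (or cuComplex, or
// the in-house cxsmpl) under nvcc, cxsmpl under hipcc (the only option, see #810), and std::complex
// or cxsmpl in plain C++. The fptype alias is an assumption standing in for the MGONGPU_FPTYPE
// choice; the HIP/cxsmpl branches are omitted here to keep the sketch compilable with g++ or nvcc.
typedef double fptype; // assumption: double precision, as with MGONGPU_FPTYPE_DOUBLE
#if defined __CUDACC__
#include <thrust/complex.h>
typedef thrust::complex<fptype> cxtype; // CUDA default in the patch (cuComplex and cxsmpl are alternatives)
#else
#include <complex>
typedef std::complex<fptype> cxtype; // C++ option shown here; the patch's HIP and C++ defaults use cxsmpl instead
#endif
int main()
{
  cxtype c( 1., 2. );
  return ( c.real() > 0 ) ? 0 : 1;
}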
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttgg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttgg.sa/src/read_slha.cc b/epochX/cudacpp/gg_ttgg.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_ttgg.sa/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttgg.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt index 2480a22f8d..37ad313b62 100644 --- a/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.mad/CODEGEN_mad_gg_ttggg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005784511566162109  +DEBUG: model prefixing takes 0.005320072174072266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.929 s +1 processes with 1240 diagrams generated in 1.852 s Total: 1 processes with 1240 diagrams output madevent ../TMPOUT/CODEGEN_mad_gg_ttggg --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -177,7 +177,7 @@ INFO: Creating files in directory P1_gg_ttxggg INFO: Computing Color-Flow optimization [15120 term] INFO: Color-Flow passed to 1630 term in 8s. Introduce 3030 contraction DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -192,15 +192,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxggg -Generated helas calls for 1 subprocesses (1240 diagrams) in 6.718 s -Wrote files for 2281 helas calls in 18.893 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.532 s +Wrote files for 2281 helas calls in 18.428 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.322 s +ALOHA: aloha creates 5 routines in 0.343 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -208,7 +208,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.319 s +ALOHA: aloha creates 10 routines in 0.327 s VVV1 VVV1 FFV1 @@ -257,10 +257,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m29.815s -user 0m29.332s -sys 0m0.380s -Code generation completed in 30 seconds +real 0m29.153s +user 0m28.584s +sys 0m0.440s +Code generation completed in 29 seconds ************************************************************ * * * W E L C O M E to * @@ -286,7 +286,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -316,7 +316,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gg_ttggg/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
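// [Editor's note] A minimal standalone sketch (not part of the patch) of the std::filesystem
// bypass used in the Bridge.h and read_slha.cc hunks above (#803): a POSIX stat() call replaces
// std::filesystem::exists so the code also builds with toolchains (e.g. on LUMI) where
// <filesystem> support is problematic. The card path below is only an illustration.
#include <sys/stat.h>
#include <iostream>
#include <string>
int main()
{
  auto fileExists = []( const std::string& fileName )
  { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; };
  std::string paramCard = "../../Cards/param_card.dat";
  if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // same fallback as in the Bridge.h hunk
  std::cout << "Using param card: " << paramCard << std::endl;
  return 0;
}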
#include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
- -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! 
- } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
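// [Editor's note] A minimal standalone CUDA sketch (not part of the patch) of the error-checking
// pattern used throughout these hunks: every runtime call is wrapped in checkGpu(), which funnels
// the returned error code into assertGpu() and prints file/line on failure. The macro pair below
// mirrors the one introduced by the new GpuRuntime.h later in this patch; build with nvcc.
#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>
#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
inline void assertGpu( cudaError_t code, const char* file, int line, bool abort = true )
{
  if( code != cudaSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
    if( abort ) assert( code == cudaSuccess );
  }
}
int main()
{
  double* devBuf = nullptr;
  checkGpu( cudaMalloc( (void**)&devBuf, 1024 * sizeof( double ) ) ); // aborts with a clear message on failure
  checkGpu( cudaFree( devBuf ) );
  return 0;
}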
#ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! 
+ */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
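
// [Editorial note] The MatrixElementKernels.cc hunks above replace CUDA's native
// kernel<<<blocks,threads[,sharedMem]>>>(args) launch syntax with gpuLaunchKernel and
// gpuLaunchKernelSharedMem wrappers. Their actual definitions live in the new GpuAbstraction.h
// header (added as a symlink later in this patch, but not reproduced in this excerpt); the
// following is only a minimal sketch of how such wrappers could be written, assuming variadic
// macros over the triple-chevron syntax that both nvcc and hipcc accept.
#if defined __CUDACC__ || defined __HIPCC__
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<( blocks ), ( threads ), ( sharedMem )>>>( __VA_ARGS__ )
#endif
// Example call site, equivalent to the computeDependentCouplings launch in the hunk above:
//   gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );
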
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
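
// [Editorial note] Every "-#ifdef __CUDACC__ / +#ifdef MGONGPUCPP_GPUIMPL" hunk in these headers
// follows the same pattern: a single macro now selects the GPU code path (and the mg5amcGpu
// namespace) for both NVidia and AMD builds, instead of the CUDA-only __CUDACC__. The macro is
// defined elsewhere (mgOnGpuConfig.h / GpuAbstraction.h, not part of this excerpt); a minimal
// sketch of the assumed logic is:
#ifndef MGONGPUCPP_GPUIMPL
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // building with a GPU compiler (nvcc or hipcc): use the mg5amcGpu code path
#endif
#endif
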
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
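
// [Editorial note] The neppM comment above refers to the AOSOA momenta layout
// momenta[npagM][npar][np4][neppM], where events within a page sit in the innermost dimension so
// that consecutive GPU threads read consecutive memory locations (coalesced access). As an
// illustration only (a simplified free function, not the plugin's MemoryAccessMomenta API), the
// flat index of event ievt, particle ipar, 4-momentum component ip4 is:
inline int momentaAosoaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // page index (outer, AOS-like)
  const int ieppM = ievt % neppM; // event index within the page (inner, SOA-like)
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}
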
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
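
// [Editorial note] The MemoryBuffers.h hunks above replace raw checkCuda( cudaMalloc... ) calls
// with gpuMalloc/gpuMallocHost/gpuFree/gpuFreeHost/gpuMemcpy wrappers that already include the
// error check. The real definitions are in GpuAbstraction.h (not shown in this excerpt); a
// minimal sketch of the assumed CUDA-side aliases is below, with an analogous block using
// hipMalloc/hipHostMalloc/hipFree/hipHostFree/hipMemcpy expected under __HIPCC__.
#ifdef __CUDACC__
#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) )
#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) )
#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
#define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#endif
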
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc index 19e6cd201c..a041636caf 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 
0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -30018,7 +30019,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -30075,7 +30076,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -30134,7 +30135,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -30293,8 +30294,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -30337,9 +30338,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -30378,7 +30379,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -30407,6 +30408,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -30420,6 +30425,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -30443,12 +30450,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -30469,7 +30476,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -30595,9 +30602,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -30621,7 +30628,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -30641,7 +30648,7 @@ namespace mg5amcCpu 
// Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -30655,9 +30662,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -30685,7 +30695,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -30895,7 +30905,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h index 2565923dde..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
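
// [Editorial note] check_sa.cc below now includes GpuRuntime.h and creates a GpuRuntime object at
// the start of main(). Combining the GpuRuntime.h fragment at the top of this excerpt with the
// comment in the check_sa.cc hunk ("invokes cudaSetDevice(0) in the constructor and books a
// cudaDeviceReset() call in the destructor"), a condensed sketch of the intended RAII behaviour is
// shown here; debug printouts are omitted and the constructor/destructor bodies are an assumption.
struct GpuRuntime final
{
  GpuRuntime( const bool debug = true ) : m_debug( debug ) { setUp( m_debug ); }
  ~GpuRuntime() { tearDown( m_debug ); }
  static void setUp( const bool /*debug*/ = true ) { checkGpu( gpuSetDevice( 0 ) ); }   // cudaSetDevice(0) on CUDA
  static void tearDown( const bool /*debug*/ = true ) { checkGpu( gpuDeviceReset() ); } // helps leak checking in cuda-memcheck
  const bool m_debug;
};
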
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! 
(AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
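
The comments above describe how MADGRAPH_CUDA_ARCHITECTURE (set just below) selects the target compute capability, e.g. 60 for a P100 or 80 for an A100. Purely as an illustration, and not part of the patch, the matching value for the local card can be obtained by querying the CUDA runtime:

// Illustration only (not part of the patch): print the compute capability of
// device 0, e.g. "sm_70", to choose a matching MADGRAPH_CUDA_ARCHITECTURE value.
#include <cstdio>
#include <cuda_runtime.h>
int main()
{
  cudaDeviceProp prop;
  if( cudaGetDeviceProperties( &prop, 0 ) != cudaSuccess ) return 1;
  std::printf( "Device 0: sm_%d%d\n", prop.major, prop.minor );
  return 0;
}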
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) 
+export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += 
-DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring 
hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 
endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) 
$(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
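
The fbridge.cc hunks above swap CudaRuntime::setUp/tearDown for GpuRuntime::setUp/tearDown around Bridge creation and deletion. The real helper lives in the new GpuRuntime.h, which is not shown here; the lines below are only an assumed sketch of what such a wrapper could look like, using the common cudaFree(0)/hipFree(0) idiom to force early context creation and a device reset for the teardown, matching the "needed by cuda-memcheck --leak-check full" comment in the runTest.cc hunk.

// Hypothetical sketch (not the actual GpuRuntime.h of the plugin): a minimal
// setUp/tearDown pair that maps onto the CUDA or HIP runtime depending on the compiler.
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuFree cudaFree
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuFree hipFree
#define gpuDeviceReset hipDeviceReset
#endif
struct GpuRuntime
{
  static void setUp() { gpuFree( 0 ); }        // no-op free: forces early creation of the GPU context
  static void tearDown() { gpuDeviceReset(); } // device reset: useful e.g. for cuda-memcheck --leak-check full
};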
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/gg_ttggg.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
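
The testxxx.cc hunk above fills the input momenta into a buffer with the AOSOA[npagM][npar=4][np4=4][neppM] layout, using pinned host memory for the GPU build and plain host memory otherwise. The plugin hides the indexing behind its MemoryAccess* helpers (e.g. MemoryAccessMomenta.h, included earlier in this diff); purely as an illustration of that layout, the flat index for event ievt, particle ipar and four-momentum component ip4 is the following.

// Illustration only: flat index into an AOSOA[npagM][npar][np4][neppM] buffer
// (the actual access pattern is encapsulated in the plugin's MemoryAccess* classes).
inline int aosoaIndex( const int ievt, const int ipar, const int ip4, const int npar, const int np4, const int neppM )
{
  const int ipagM = ievt / neppM; // page, i.e. block of neppM events
  const int ieppM = ievt % neppM; // position of the event within its page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}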
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h index 80032e528b..6bde4466d0 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
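
The mgOnGpuConfig.h hunk above keeps defining empty __global__, __host__ and __device__ specifiers when MGONGPUCPP_GPUIMPL is not set, now covering plain C++ builds alongside CUDA and HIP. As a small illustration (not taken from the patch), this is what allows a decorated helper to compile unchanged with nvcc, hipcc or a plain C++ compiler:

// Illustration only: thanks to the empty specifiers defined in mgOnGpuConfig.h
// for C++-only builds, this helper builds as device code under nvcc/hipcc and
// as an ordinary inline host function in C++ builds.
#include "mgOnGpuConfig.h"
__host__ __device__ inline double sumSquares( const double a, const double b )
{
  return a * a + b * b;
}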
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see 
#318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.mad/src/read_slha.cc b/epochX/cudacpp/gg_ttggg.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_ttggg.mad/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttggg.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt index 0970bf8b4c..382962d284 100644 --- a/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt +++ b/epochX/cudacpp/gg_ttggg.sa/CODEGEN_cudacpp_gg_ttggg_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,7 +62,7 @@ generate g g > t t~ g g g No model currently active, so we import the Standard Model INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005753755569458008  +DEBUG: model prefixing takes 0.005497932434082031  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -155,7 +155,7 @@ INFO: Please specify coupling orders to bypass this step. INFO: Trying coupling order WEIGHTED<=5: WEIGTHED IS QCD+2*QED INFO: Trying process: g g > t t~ g g g WEIGHTED<=5 @1 INFO: Process has 1240 diagrams -1 processes with 1240 diagrams generated in 1.912 s +1 processes with 1240 diagrams generated in 1.865 s Total: 1 processes with 1240 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gg_ttggg Load PLUGIN.CUDACPP_OUTPUT @@ -175,7 +175,7 @@ INFO: Creating files in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TM FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.h FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/./CPPProcess.cc INFO: Created files CPPProcess.h and CPPProcess.cc in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/SubProcesses/P1_Sigma_sm_gg_ttxggg/. 
-Generated helas calls for 1 subprocesses (1240 diagrams) in 6.716 s +Generated helas calls for 1 subprocesses (1240 diagrams) in 6.712 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -183,7 +183,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.352 s +ALOHA: aloha creates 5 routines in 0.345 s VVV1 VVV1 FFV1 @@ -206,7 +206,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gg_ttggg/src/. quit -real 0m13.290s -user 0m13.123s -sys 0m0.115s +real 0m13.123s +user 0m12.875s +sys 0m0.149s Code generation completed in 13 seconds diff --git a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT +++ b/epochX/cudacpp/gg_ttggg.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? 
CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
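Note on the std::filesystem bypass in Bridge.h above (#803): the replacement is a plain POSIX stat() existence check wrapped in a lambda. A minimal self-contained sketch of the same pattern follows; the param card path and the single "../" fallback mirror the Bridge.h snippet, while the main() wrapper is illustrative only.

#include <sys/stat.h>
#include <iostream>
#include <string>

int main()
{
  // stat() returns 0 if the path exists: same pattern as the fileExists lambda in Bridge.h
  auto fileExists = []( const std::string& fileName )
  { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; };
  std::string paramCard = "../../Cards/param_card.dat";
  if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // fall back one directory up
  std::cout << "Using param card: " << paramCard << std::endl;
  return 0;
}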
#include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! 
assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. 
Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
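The new GpuAbstraction.h above maps each gpu* macro onto the corresponding CUDA or HIP runtime call, so calling code can be written once for both backends. A minimal usage sketch follows, assuming a CUDA or HIP compiler (so that __CUDACC__ or __HIPCC__ is defined) and a GPU build in which MGONGPUCPP_GPUIMPL and checkGpu are available via GpuRuntime.h; the kernel, the one-block launch and the buffer size are illustrative and not taken from the plugin.

#include "GpuAbstraction.h"
#include "GpuRuntime.h" // provides checkGpu, which the gpu* macros use internally

__global__ void scaleByTwo( double* data, int n )
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) data[i] *= 2.;
}

void scaleOnDevice( const double* hstIn, double* hstOut, int n ) // assumes n <= max threads per block
{
  double* devBuf = nullptr;
  gpuMalloc( (void**)&devBuf, n * sizeof( double ) );                       // cudaMalloc or hipMalloc
  gpuMemcpy( devBuf, hstIn, n * sizeof( double ), gpuMemcpyHostToDevice );  // host-to-device copy
  gpuLaunchKernel( scaleByTwo, 1, n, devBuf, n );                           // scaleByTwo<<<1, n>>>( devBuf, n )
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
  gpuMemcpy( hstOut, devBuf, n * sizeof( double ), gpuMemcpyDeviceToHost ); // device-to-host copy
  gpuFree( devBuf );
}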
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
+// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
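For readers less used to the CUDA triple-chevron syntax being replaced in MatrixElementKernels.cc above, the two launch macros are purely textual rewrites of the same kernel launch; a sketch of the expansion under nvcc or hipcc, with the sigmaKin argument list abbreviated as args...:

// before: sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( args... );
// after:  gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, args... );
// and, when no dynamic shared memory is needed:
// before: computeDependentCouplings<<<m_gpublocks, m_gputhreads>>>( m_gs.data(), m_couplings.data() );
// after:  gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() );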
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
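As a concrete reading of the kernel-side event indexing kept unchanged in KernelAccessHelper above, ievt = blockDim.x * blockIdx.x + threadIdx.x assigns one event per GPU thread: with an illustrative grid of blockDim.x = 256 threads per block, the thread with blockIdx.x = 2 and threadIdx.x = 5 handles event ievt = 256 * 2 + 5 = 517.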
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
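As a worked instance of the neppM guidance quoted above for MemoryAccessMomenta.h: a 32-byte cacheline holds 32/8 = 4 double-precision or 32/4 = 8 single-precision fptype values, so on GPUs neppM would be chosen as a power-of-two multiple of 4 (double) or 8 (float) to keep momenta reads from global memory coalesced.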
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template 
class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
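A minimal sketch of how these paired pinned-host and device buffer types are meant to be used with the copyDeviceFromHost / copyHostFromDevice helpers defined further down in this file; the buffer type and variable names follow the MatrixElementKernels.cc usage shown earlier, while the size ncomb and the wrapper function are illustrative, and a GPU build (MGONGPUCPP_GPUIMPL defined) is assumed.

#include "MemoryBuffers.h"

void helicityMaskRoundTrip()
{
  using namespace mg5amcGpu;                          // GPU-build namespace, as in the files above
  const int ncomb = 64;                               // illustrative number of helicity combinations
  PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); // pinned host buffer, allocated via gpuMallocHost
  DeviceBufferHelicityMask devIsGoodHel( ncomb );     // device buffer, allocated via gpuMalloc
  // ... a device kernel fills devIsGoodHel ...
  copyHostFromDevice( hstIsGoodHel, devIsGoodHel );   // wraps gpuMemcpy( ..., gpuMemcpyDeviceToHost )
}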
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc index c2f8607428..ff20b7ba63 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g g WEIGHTED<=5 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -31908,7 +31909,7 @@ namespace mg5amcCpu { -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, 514, 505, -62, 496, -71, 568, -44, -134, -53, -62, 19, -71, 10, -80, 100, 10, 640, -80, 1, -8, 10, 1, 64, -8, -62, -71, 10, -80, 1, -8, 28, -62, -62, 10, 10, 1, 1, -8, 10, 1, 64, -8, -8, 64, -80, -8, -512, 64, 496, 568, -80, 640, -8, 64, -224, 496, 496, -80, -80, -8, -8, 64, -80, -8, -512, 64, 64, -512, 640, 64, 4096, -512 }, { 136, -116, -116, -44, -44, 514, -116, 442, -44, 28, -53, -62, -44, -53, 514, -62, 100, 10, 28, -62, -62, 10, 10, 1, -116, 442, 442, -134, -134, 505, -44, -134, 28, -224, -62, 496, -53, 19, -62, 496, 10, -80, -62, -71, 10, -80, 1, -8, -44, -134, -53, -62, 19, -71, 514, 505, -62, 496, -71, 568, 100, 10, 10, -80, -80, 640, 10, 1, 1, -8, -8, 64, 28, -62, -62, 10, 10, 1, -62, -71, 10, -80, 1, -8, 10, 1, 1, -8, -8, 64, -80, -8, -8, 64, 64, -512, -224, 496, 496, -80, -80, -8, 496, 568, -80, 640, -8, 64, -80, -8, -8, 64, 64, -512, 640, 64, 64, -512, -512, 4096 } }; // 2-D array[120][120] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -31965,7 +31966,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -32024,7 +32025,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -32183,8 +32184,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1, 1 }, { 1, 1, 1, -1, 1, 1, -1 }, { 1, 1, 1, -1, 1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -32227,9 +32228,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... 
}; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -32268,7 +32269,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -32297,6 +32298,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -32310,6 +32315,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -32333,12 +32340,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -32359,7 +32366,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -32485,9 +32492,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -32511,7 +32518,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -32531,7 +32538,7 @@ namespace mg5amcCpu 
// Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 1536 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -32545,9 +32552,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -32575,7 +32585,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -32785,7 +32795,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h index 2565923dde..fff95b66e2 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/P1_Sigma_sm_gg_ttxggg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
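GpuRuntime.h enters this process directory only as a symlink above, so its body is not visible in the patch. As a purely illustrative sketch (hypothetical class name, using only standard CUDA/HIP runtime calls), the RAII helper that the "00 GpuInit" step in check_sa.cc below relies on could look like this: the constructor selects device 0 and the destructor books the device reset, as described in the updated comments.

// Illustrative sketch only, NOT the actual GpuRuntime.h added (as a symlink) above.
#include <cstdio>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#endif
struct GpuRuntimeSketch final
{
  GpuRuntimeSketch( const bool debug = false )
    : m_debug( debug )
  {
    if( m_debug ) printf( "GpuRuntimeSketch: selecting GPU device 0\n" );
#ifdef __CUDACC__
    cudaSetDevice( 0 ); // CUDA: bind the application to device 0
#elif defined __HIPCC__
    hipSetDevice( 0 ); // HIP: bind the application to device 0
#endif
  }
  ~GpuRuntimeSketch()
  {
    if( m_debug ) printf( "GpuRuntimeSketch: resetting the GPU device\n" );
#ifdef __CUDACC__
    cudaDeviceReset(); // tear down the device context at the end of main
#elif defined __HIPCC__
    hipDeviceReset();
#endif
  }
  const bool m_debug;
};

In check_sa.cc below, such an object is instantiated right after starting the "00 GpuInit" timer, so device initialisation gets its own entry in the timing summary.
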
//========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
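In the RamboSamplingKernels.cc hunks above, the CUDA-only triple-chevron launches ( kernel<<<blocks, threads>>>( args ) ) are replaced by calls to a portable gpuLaunchKernel( kernel, blocks, threads, args... ) helper. Its definition is not part of this diff; a hedged sketch of one possible implementation (assumed macro, default stream, no dynamic shared memory) is:

// Illustrative sketch only, NOT the plugin's actual GpuAbstraction.h definition.
#ifdef __CUDACC__
// On CUDA, forward to the native triple-chevron launch syntax.
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
// On HIP, forward to hipLaunchKernelGGL (0 bytes of dynamic shared memory, default stream).
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif

With a wrapper of this kind, getMomentaInitialDevice<<<m_gpublocks, m_gputhreads>>>( m_energy, m_momenta.data() ) and its gpuLaunchKernel counterpart above produce the same launch on an NVidia GPU, while the HIP branch lets the identical source line build with hipcc.
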
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
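Condensing the selection order that the FIXME above describes (and that the hunks below implement), the effective logic is roughly the following sketch; this is an illustration of the control flow only, not a drop-in fragment of cudacpp.mk:

# Illustration only: condensed view of the GPU backend selection implemented below.
# 1. use nvcc from CUDA_HOME if it exists; 2. else use hipcc from HIP_HOME; 3. else build C++ only.
ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),)
  GPUCC = $(CUDA_HOME)/bin/nvcc
else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),)
  GPUCC = $(HIP_HOME)/bin/hipcc
else
  GPUCC =
endif
# Hence exporting an invalid CUDA_HOME (or HIP_HOME) disables that backend, while
# REQUIRE_CUDA / REQUIRE_HIP turn a missing backend into a hard error (e.g. for CI on GPU nodes).
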
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
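The fbridge.cc hunks that continue below replace CudaRuntime.h with GpuRuntime.h and guard the runtime setup and teardown with MGONGPUCPP_GPUIMPL, so that the same Fortran bridge source builds for CUDA, HIP and plain C++. GpuRuntime.h itself is not part of this excerpt; the following is only a minimal sketch of what such a backend-neutral helper could look like, assuming it keeps the static setUp()/tearDown() interface called from fbridgecreate_/fbridgedelete_ (the device-selection and reset calls shown here are illustrative guesses, not the plugin's actual implementation).

// Sketch only: hypothetical stand-in for the plugin's GpuRuntime helper.
#if defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuSetDevice hipSetDevice
#define gpuDeviceReset hipDeviceReset
#elif defined __CUDACC__
#include <cuda_runtime.h>
#define gpuSetDevice cudaSetDevice
#define gpuDeviceReset cudaDeviceReset
#endif
#if defined __HIPCC__ || defined __CUDACC__
struct GpuRuntimeSketch
{
  static void setUp() { gpuSetDevice( 0 ); }   // assumption: select the first visible device
  static void tearDown() { gpuDeviceReset(); } // cf. the checkGpu( gpuDeviceReset() ) call in runTest.cc
};
#endif

Since both calls sit inside #ifdef MGONGPUCPP_GPUIMPL in the hunks below, the CPU-only build never touches a GPU runtime at all.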
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. 
Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gg_ttggg.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/HelAmps_sm.h @@ -5,7 
+5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gg_ttggg.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h index b247654dcf..475749ca7c 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
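The mgOnGpuConfig.h hunk above defines MGONGPUCPP_GPUIMPL whenever either __CUDACC__ or __HIPCC__ is set, and leaves it undefined for plain C++; the rest of this patch (including the mgOnGpuCxtypes.h changes that continue below) then uses MGONGPUCPP_GPUIMPL for the "any GPU" branches and keeps __CUDACC__/__HIPCC__ only where CUDA and HIP genuinely differ, such as the complex-number type. A minimal usage sketch, where backendName() is an invented helper for illustration only:

#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // any GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // plain C++ (possibly SIMD) build
#endif
{
  inline constexpr const char* backendName()
  {
#if defined __CUDACC__
    return "cuda"; // nvcc-specific paths, e.g. thrust/cuComplex complex types
#elif defined __HIPCC__
    return "hip"; // hipcc-specific paths, e.g. the cxsmpl complex type
#else
    return "cpp"; // CPU paths, e.g. std::complex or cxsmpl plus SIMD vectors
#endif
  }
}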
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and 
#725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/rambo.h +++ b/epochX/cudacpp/gg_ttggg.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gg_ttggg.sa/src/read_slha.cc b/epochX/cudacpp/gg_ttggg.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gg_ttggg.sa/src/read_slha.cc +++ b/epochX/cudacpp/gg_ttggg.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt index 2c0e77fafd..cfbc521449 100644 --- a/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.mad/CODEGEN_mad_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005677223205566406  +DEBUG: model prefixing takes 0.0053598880767822266  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.079 s Total: 8 processes with 40 diagrams output madevent ../TMPOUT/CODEGEN_mad_gq_ttq --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -198,7 +198,7 @@ INFO: Combined process g d~ > t t~ d~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ INFO: Combined process g s~ > t t~ s~ WEIGHTED<=3 @1 with process g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -215,7 +215,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -230,17 +230,17 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux -Generated helas calls for 2 subprocesses (10 diagrams) in 0.032 s -Wrote files for 32 helas calls in 0.231 s +Generated helas calls for 2 subprocesses (10 diagrams) in 0.043 s +Wrote files for 32 helas calls in 0.217 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.364 s +ALOHA: aloha creates 2 routines in 0.144 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 4 routines in 0.137 s +ALOHA: aloha creates 4 routines in 0.130 s FFV1 FFV1 FFV1 @@ -294,10 +294,10 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m2.934s -user 0m1.748s -sys 0m0.220s -Code generation completed in 3 seconds +real 0m1.946s +user 0m1.693s +sys 0m0.230s +Code generation completed in 2 seconds ************************************************************ * * * W E L C O M E to * @@ -323,7 +323,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -353,7 +353,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_gq_ttq/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gq_ttq.mad/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
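For reference, a minimal standalone sketch of the stat()-based existence check that the Bridge constructor above now uses to bypass std::filesystem (#803). The helper name and the main() driver here are illustrative, not part of the patch; only the stat() call and the "../" fallback mirror the Bridge.h hunk.

#include <sys/stat.h>
#include <iostream>
#include <string>

// Returns true if the path can be stat'ed, i.e. the file exists and is accessible
static bool fileExists( const std::string& fileName )
{
  struct stat buffer;
  return stat( fileName.c_str(), &buffer ) == 0;
}

int main()
{
  std::string paramCard = "../../Cards/param_card.dat";
  if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // same fallback logic as in Bridge.h
  std::cout << "Using param card: " << paramCard << std::endl;
  return 0;
}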
#include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
- -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! 
- } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! 
+ */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
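The kernel launches in MatrixElementKernels.cc above now go through the gpuLaunchKernel / gpuLaunchKernelSharedMem wrappers from GpuAbstraction.h. The triple-chevron arguments are stripped in this rendering of the patch text, so the following is a reconstruction of what such wrappers typically look like (both nvcc and hipcc accept the chevron launch syntax); treat it as a sketch rather than a verbatim quote of the patch.

// Reconstructed sketch of the launch wrappers (chevron arguments are stripped above)
#define gpuLaunchKernel( kernel, blocks, threads, ... ) \
  kernel<<<blocks, threads>>>( __VA_ARGS__ )
#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) \
  kernel<<<blocks, threads, sharedMem>>>( __VA_ARGS__ )

// Example: the sigmaKin launch in MatrixElementKernelDevice::computeMatrixElements,
//   gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), ... );
// expands to
//   sigmaKin<<<m_gpublocks, m_gputhreads, sharedMemSize>>>( m_momenta.data(), ... );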
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
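The momenta buffers handled by MemoryAccessMomenta.h use the AOSOA layout momenta[npagM][npar][np4][neppM] with nevt = npagM*neppM, as noted in the dev_transposeMomentaF2C comment earlier in this patch. A small illustrative helper (function and parameter names are assumptions, not repository code) that computes the flat index of one momentum component in that layout:

#include <cstddef>

// Flat index of component ip4 of particle ipar for event ievt in the
// row-major AOSOA array momenta[npagM][npar][np4][neppM]
inline std::size_t aosoaIndex( std::size_t ievt, std::size_t ipar, std::size_t ip4,
                               std::size_t npar, std::size_t np4, std::size_t neppM )
{
  const std::size_t ipagM = ievt / neppM; // "page" (event block) index
  const std::size_t ieppM = ievt % neppM; // event index within the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}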
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class 
HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
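The PinnedHostBufferBase/DeviceBufferBase changes above replace checkCuda(cudaMalloc(...))/checkCuda(cudaFree(...)) with the backend-neutral gpuMalloc/gpuFree wrappers. A condensed RAII sketch in the same spirit, assuming the gpuMalloc/gpuFree/checkGpu macros from GpuAbstraction.h and GpuRuntime.h introduced in this patch; the class name is illustrative and the BufferBase hierarchy of MemoryBuffers.h is deliberately omitted.

#include "GpuRuntime.h" // pulls in GpuAbstraction.h and defines checkGpu/assertGpu

#include <cstddef>

#ifdef MGONGPUCPP_GPUIMPL
// Illustrative RAII device buffer (not repository code)
template<typename T>
class SimpleDeviceBuffer
{
public:
  explicit SimpleDeviceBuffer( const std::size_t size )
    : m_size( size ), m_data( nullptr )
  {
    gpuMalloc( &m_data, bytes() ); // checkGpu aborts on allocation failure
  }
  ~SimpleDeviceBuffer() { gpuFree( m_data ); }
  SimpleDeviceBuffer( const SimpleDeviceBuffer& ) = delete;
  SimpleDeviceBuffer& operator=( const SimpleDeviceBuffer& ) = delete;
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
  std::size_t bytes() const { return m_size * sizeof( T ); }
private:
  std::size_t m_size;
  T* m_data;
};
#endif // MGONGPUCPP_GPUIMPL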
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index 6242b019fa..4ece50575c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START 
LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -630,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! 
+ // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -643,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -666,12 +673,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +699,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +825,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +851,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +871,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +885,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +918,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1128,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum 
over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
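Throughout these hunks the old __CUDACC__-only guards become MGONGPUCPP_GPUIMPL, a single symbol meant to cover both the CUDA and the HIP code paths, while __CUDACC__ and __HIPCC__ are still tested individually where the two backends genuinely differ (as in the curand defaults just above). The diff itself never shows where MGONGPUCPP_GPUIMPL is defined; the snippet below is only a minimal sketch, assuming it is derived in a plugin header (such as mgOnGpuConfig.h or the new GpuAbstraction.h) from the compiler-provided macros.

// Illustrative sketch only - the real definition lives in the plugin headers, not in this diff.
// A single "GPU implementation" guard is derived from the compiler-specific macros, so that
// most device-related code can test one symbol instead of two.
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // nvcc (CUDA) or hipcc (HIP) is compiling this translation unit
#endif

With such a guard, #ifdef MGONGPUCPP_GPUIMPL selects the mg5amcGpu namespace, the device buffers and the GPU kernels for either backend.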
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
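The "00 GpuInit" step above replaces the CUDA-specific CudaRuntime helper with a backend-neutral GpuRuntime whose constructor selects the device and whose destructor books the reset, as the updated comment explains. GpuRuntime.h is added to this subprocess only as a symlink, so its body is not visible in the diff; the following is a hedged sketch of such an RAII wrapper, assuming the CUDA branch keeps the documented cudaSetDevice(0)/cudaDeviceReset() behaviour and that a HIP branch would use the corresponding hipSetDevice/hipDeviceReset calls.

// Sketch only (not the actual GpuRuntime.h added by this PR): an RAII helper that
// initialises the GPU when constructed and resets it when destroyed.
#include <iostream>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#endif
class GpuRuntimeSketch
{
public:
  GpuRuntimeSketch( const bool debug = true )
    : m_debug( debug )
  {
    if( m_debug ) std::cout << "GpuRuntimeSketch: setting device 0" << std::endl;
#ifdef __CUDACC__
    cudaSetDevice( 0 ); // select the first visible NVidia GPU
#elif defined __HIPCC__
    hipSetDevice( 0 ); // select the first visible AMD GPU
#endif
  }
  ~GpuRuntimeSketch()
  {
    if( m_debug ) std::cout << "GpuRuntimeSketch: resetting the device" << std::endl;
#ifdef __CUDACC__
    cudaDeviceReset(); // tear the context down explicitly (helps leak-checking tools)
#elif defined __HIPCC__
    hipDeviceReset();
#endif
  }
private:
  const bool m_debug;
};

Instantiated once at the top of main() inside the #ifdef MGONGPUCPP_GPUIMPL block, exactly as the GpuRuntime GpuRuntime( debug ) line above does with the real class, this ties the lifetime of the GPU context to the lifetime of the application.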
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 90788b2c75..fee492fbc1 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; 
#endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -630,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
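The block commented out here tried to discover the GCC toolchain behind clang by running readelf on the library reported by ${CXX} at run time; the two AV comments give the reasons for disabling it (the CXX variable seen at run time need not be the compiler that produced the binary, and the interesting values are build-time values anyway). A build-time alternative is to rely on macros predefined by the compiler itself, which are baked into the binary; the snippet below is only a sketch of that idea, and it reports the libstdc++ major release rather than the exact toolchain version the disabled code was after.

// Sketch only: report build-time compiler information from predefined macros,
// instead of shelling out to ${CXX} at run time.
#include <sstream>
#include <string>
std::string compilerInfoSketch()
{
  std::stringstream out;
#if defined __clang__
  out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__;
#if defined _GLIBCXX_RELEASE
  // When clang uses libstdc++, the headers advertise the GCC release they ship with (GCC >= 7)
  out << " (libstdc++ from gcc " << _GLIBCXX_RELEASE << ")";
#endif
#elif defined __GNUC__
  out << "gcc (GCC) " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
#else
  out << "unknown compiler";
#endif
  return out.str();
}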
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -643,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -666,12 +673,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +699,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +825,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +851,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +871,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +885,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +918,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1128,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
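As in the buffer allocations a few hunks above, the GPU builds keep using the PinnedHostBuffer* variants while the CPU-only builds fall back to the plain HostBuffer* types; only the guard changes from __CUDACC__ to MGONGPUCPP_GPUIMPL. The buffer classes themselves (in MemoryBuffers.h) are not part of this diff; the snippet below is only an illustrative sketch, with a hypothetical class name, of why a pinned (page-locked) host allocation is worth the distinction: it typically speeds up host-device copies and is needed for copies that truly overlap with computation.

// Sketch only, hypothetical name (the real buffers live in MemoryBuffers.h):
// page-locked host memory for GPU builds, plain new[] otherwise.
#include <cstddef>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#endif
template<typename T>
class PinnedHostBufferSketch
{
public:
  explicit PinnedHostBufferSketch( const std::size_t size )
    : m_size( size ), m_data( nullptr )
  {
#ifdef __CUDACC__
    cudaMallocHost( (void**)&m_data, m_size * sizeof( T ) ); // page-locked allocation
#else
    m_data = new T[m_size]; // pageable memory in CPU-only builds
#endif
  }
  ~PinnedHostBufferSketch()
  {
#ifdef __CUDACC__
    cudaFreeHost( m_data );
#else
    delete[] m_data;
#endif
  }
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  const std::size_t m_size;
  T* m_data;
};

A HIP build would use the analogous hipHostMalloc/hipHostFree calls; the real classes presumably also check the returned error codes, which the sketch omits.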
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
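The kernel launches in RamboSamplingKernels.cc above change from the CUDA-only <<<blocks,threads>>> syntax to gpuLaunchKernel( kernel, blocks, threads, args... ), and the constant-memory copies in CPPProcess.cc change from checkCuda( cudaMemcpyToSymbol( ... ) ) to gpuMemcpyToSymbol( ... ); the makefile below then compiles the same .cc sources either as CUDA (-x cu) or as HIP (-x hip). GpuAbstraction.h, where these gpu* names must be mapped onto the two runtimes, is only added as a symlink in this diff, so the following is a hedged sketch of what such a mapping could look like (error checking, which the old checkCuda wrapper provided, is omitted for brevity).

// Sketch only (not the actual GpuAbstraction.h): map generic gpu* names onto the CUDA or HIP
// runtime so that the bulk of the code never mentions either runtime directly.
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuMemcpyToSymbol( symbol, src, bytes ) cudaMemcpyToSymbol( symbol, src, bytes )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<( blocks ), ( threads )>>>( __VA_ARGS__ )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuMemcpyToSymbol( symbol, src, bytes ) hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes )
#define gpuLaunchKernel( kernel, blocks, threads, ... ) hipLaunchKernelGGL( kernel, dim3( blocks ), dim3( threads ), 0, 0, __VA_ARGS__ )
#endif

With -x cu the file is compiled by nvcc as CUDA source (so __CUDACC__ is defined and the first branch is taken), and with -x hip it is compiled by hipcc as HIP source (so __HIPCC__ is defined and the second branch is taken).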
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. 
Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
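// [Editor's illustrative sketch - not part of the patch]
// The runTest.cc hunk above replaces checkCuda( cudaDeviceReset() ) with the
// backend-neutral checkGpu( gpuDeviceReset() ), and fbridge.cc now includes
// "GpuRuntime.h" instead of "CudaRuntime.h". The GpuRuntime/GpuAbstraction
// headers themselves are not shown in this diff, so the aliasing below is only
// an assumed, minimal example of how gpuXxx names can be mapped onto the CUDA
// or HIP runtime at compile time (the real headers may differ).
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuDeviceReset hipDeviceReset
#endif
// With such aliases, a checkGpu-style assertion can be written once for both
// backends, e.g. assert( gpuDeviceReset() == gpuSuccess );
// [End of editor's sketch]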
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py +++ 
b/epochX/cudacpp/gq_ttq.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h index 80032e528b..6bde4466d0 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
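// [Editor's illustrative sketch - not part of the patch]
// The mgOnGpuConfig.h hunks above introduce MGONGPUCPP_GPUIMPL, defined when
// compiling with nvcc (__CUDACC__) or hipcc (__HIPCC__) and left undefined for
// plain C++ builds. Code that is common to both GPU backends tests
// MGONGPUCPP_GPUIMPL, while truly backend-specific code still tests __CUDACC__
// or __HIPCC__ directly. The function below is only a schematic example of
// that convention; the namespace names follow the pattern visible throughout
// this diff, and backendName() itself is a hypothetical helper.
#include "mgOnGpuConfig.h"
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // any GPU build (CUDA or HIP)
#else
namespace mg5amcCpu // host-only build (scalar or SIMD C++)
#endif
{
  inline constexpr const char* backendName()
  {
#if defined __CUDACC__
    return "cuda"; // nvcc-specific branch
#elif defined __HIPCC__
    return "hip"; // hipcc-specific branch
#else
    return "cpp"; // MGONGPUCPP_GPUIMPL not defined
#endif
  }
}
// [End of editor's sketch]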
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) 
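// [Editor's illustrative sketch - not part of the patch]
// Summary of the complex-type selection implemented in the mgOnGpuCxtypes.h
// hunks above, and of why some guards must stay on __CUDACC__ rather than on
// MGONGPUCPP_GPUIMPL: thrust::complex and cuComplex only exist in nvcc builds,
// while hipcc builds currently fall back to the project's own cxsmpl type
// (declared in mgOnGpuCxtypes.h and not repeated here). The typedef name and
// the hardcoded double precision below are illustrative assumptions only.
#if defined __CUDACC__ && defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
typedef thrust::complex<double> cxtype_example; // CUDA default
#elif defined __CUDACC__ && defined MGONGPU_CUCXTYPE_CUCOMPLEX
#include <cuComplex.h>
typedef cuDoubleComplex cxtype_example; // CUDA alternative
#elif defined __HIPCC__
// HIP: cxsmpl is the only supported choice so far (see #810)
// typedef mgOnGpu::cxsmpl<double> cxtype_example;
#else
#include <complex>
typedef std::complex<double> cxtype_example; // C++ (or cxsmpl via MGONGPU_CPPCXTYPE_CXSMPL)
#endif
// [End of editor's sketch]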
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.mad/src/rambo.h b/epochX/cudacpp/gq_ttq.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.mad/src/read_slha.cc b/epochX/cudacpp/gq_ttq.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gq_ttq.mad/src/read_slha.cc +++ b/epochX/cudacpp/gq_ttq.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt index f659f6bb8d..fe303ed372 100644 --- a/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt +++ b/epochX/cudacpp/gq_ttq.sa/CODEGEN_cudacpp_gq_ttq_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define q = u c d s u~ c~ d~ s~ INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.0054836273193359375  +DEBUG: model prefixing takes 0.0057065486907958984  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -170,7 +170,7 @@ INFO: Crossed process found for g u~ > t t~ u~, reuse diagrams. INFO: Crossed process found for g c~ > t t~ c~, reuse diagrams. INFO: Crossed process found for g d~ > t t~ d~, reuse diagrams. INFO: Crossed process found for g s~ > t t~ s~, reuse diagrams. -8 processes with 40 diagrams generated in 0.080 s +8 processes with 40 diagrams generated in 0.077 s Total: 8 processes with 40 diagrams output standalone_cudacpp ../TMPOUT/CODEGEN_cudacpp_gq_ttq Load PLUGIN.CUDACPP_OUTPUT @@ -211,7 +211,7 @@ Generated helas calls for 2 subprocesses (10 diagrams) in 0.031 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVV1 routines -ALOHA: aloha creates 2 routines in 0.146 s +ALOHA: aloha creates 2 routines in 0.141 s FFV1 FFV1 FFV1 @@ -227,7 +227,7 @@ INFO: Created files Parameters_sm.h and Parameters_sm.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_gq_ttq/src/. 
quit -real 0m0.709s -user 0m0.586s -sys 0m0.064s -Code generation completed in 0 seconds +real 0m0.652s +user 0m0.583s +sys 0m0.057s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/gq_ttq.sa/COPYRIGHT +++ b/epochX/cudacpp/gq_ttq.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( 
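The read_slha.cc hunk earlier in this diff drops std::filesystem::path and simply concatenates the environment directory and the card name to ease portability on LUMI (#803). A minimal standalone sketch of that fallback, assuming a hypothetical CARDPATH environment variable and card name rather than the plugin's actual ones:

// Sketch only: locate a card file without <filesystem>, in the spirit of the #803 bypass.
#include <cstdlib>  // std::getenv
#include <fstream>  // std::ifstream
#include <iostream>
#include <string>

int main()
{
  const std::string fileName = "param_card.dat"; // hypothetical card name
  std::ifstream card( fileName );                // first try the path as given
  if( !card.good() )
  {
    const char* envpath = std::getenv( "CARDPATH" ); // hypothetical environment variable
    if( envpath )
    {
      // Same idea as the diff: plain string concatenation instead of std::filesystem::path
      const std::string fileName2 = std::string( envpath ) + "/" + fileName;
      card.open( fileName2.c_str() );
    }
  }
  std::cout << ( card.good() ? "card found" : "card not found" ) << std::endl;
  return 0;
}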
m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... 
this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
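The Bridge constructor hunk above replaces std::filesystem::exists with a small stat(2)-based lambda, again to bypass std::filesystem (#803). A self-contained sketch of that check, with a hypothetical relative card path:

// Sketch only: POSIX stat-based existence check, as used to bypass std::filesystem (#803).
#include <sys/stat.h>
#include <iostream>
#include <string>

int main()
{
  auto fileExists = []( const std::string& fileName )
  { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; };
  std::string paramCard = "param_card.dat";                     // hypothetical relative path
  if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // fall back one directory up
  std::cout << "using card: " << paramCard << std::endl;
  return 0;
}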
#ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] 
- // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! 
+ */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
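The MatrixElementKernels.cc hunks above rewrite every raw CUDA launch and checkCuda call in terms of the gpuLaunchKernel and checkGpu wrappers, so the same source compiles for CUDA and HIP. A CUDA-only sketch of how that pattern expands, with a made-up 'scale' kernel and locally re-declared macros (the real ones live in GpuAbstraction.h / GpuRuntime.h):

// Sketch only: the gpuLaunchKernel / checkGpu pattern, CUDA path.
#include <cuda_runtime.h>
#include <cassert>
#include <cstdio>

#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); }
inline void assertGpu( cudaError_t code, const char* file, int line, bool abort = true )
{
  if( code != cudaSuccess )
  {
    printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line );
    if( abort ) assert( code == cudaSuccess );
  }
}
#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<<blocks, threads>>>( __VA_ARGS__ )

__global__ void scale( double* data, const double factor, const int n ) // made-up kernel
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) data[i] *= factor;
}

int main()
{
  const int n = 1024;
  double* d = nullptr;
  checkGpu( cudaMalloc( &d, n * sizeof( double ) ) );
  gpuLaunchKernel( scale, n / 256, 256, d, 2.0, n ); // expands to scale<<<4, 256>>>( d, 2.0, n )
  checkGpu( cudaPeekAtLastError() );   // catch launch errors
  checkGpu( cudaDeviceSynchronize() ); // catch asynchronous execution errors
  checkGpu( cudaFree( d ) );
  return 0;
}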
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
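The recurring one-line change in these MemoryAccess* headers is the guard rename: code that used to select the namespace with __CUDACC__ now selects it with MGONGPUCPP_GPUIMPL, so HIP builds also end up in mg5amcGpu. A stripped-down sketch of the idiom (the helper function is invented for illustration):

// Sketch only: the CPU/GPU namespace-selection idiom after the rename.
// MGONGPUCPP_GPUIMPL is set for any GPU build (CUDA or HIP), while __CUDACC__ is nvcc-specific.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  // Hypothetical helper: the same source ends up in a different namespace (and hence a
  // different symbol) in GPU and CPU builds, avoiding clashes between the two (#318, #725).
  inline int helper() { return 42; }
}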
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
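The MemoryAccessMomenta.h comment above ties neppM (the number of events per page) to coalesced access on GPUs; combined with the momenta[npagM][npar][np4][neppM] AOSOA layout quoted in the Bridge.h hunk earlier in this diff, the indexing works out as in this sketch (the npar/np4/neppM values are illustrative only):

// Sketch only: flat index into an AOSOA momenta buffer laid out as [npagM][npar][np4][neppM].
#include <cstddef>

constexpr int npar = 5;  // particles per event (illustrative)
constexpr int np4 = 4;   // E, px, py, pz
constexpr int neppM = 4; // events per page (a power of two helps GPU coalescing)

inline std::size_t momentumIndex( int ievt, int ipar, int ip4 )
{
  const int ipagM = ievt / neppM; // page containing this event
  const int ieppM = ievt % neppM; // slot of this event inside the page
  return ( ( static_cast<std::size_t>( ipagM ) * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}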
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events template class HostBuffer 
: public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
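The MemoryBuffers.h hunks above keep the pinned-host and device buffer classes as they were and only re-route their constructors and destructors through the gpu* wrappers, so one set of RAII classes serves both CUDA and HIP. A condensed CUDA-path sketch of that RAII shape (SimpleDeviceBuffer is an invented name; the real classes also check errors via checkGpu):

// Sketch only: RAII device buffer in the spirit of DeviceBufferBase (CUDA path, error checks trimmed).
#include <cuda_runtime.h>
#include <cstddef>

template<typename T>
class SimpleDeviceBuffer
{
public:
  explicit SimpleDeviceBuffer( std::size_t size )
    : m_size( size ), m_data( nullptr )
  {
    cudaMalloc( &m_data, m_size * sizeof( T ) ); // real code: gpuMalloc(...) wrapped in checkGpu
  }
  ~SimpleDeviceBuffer() { cudaFree( m_data ); } // freed automatically when the buffer goes out of scope
  SimpleDeviceBuffer( const SimpleDeviceBuffer& ) = delete;
  SimpleDeviceBuffer& operator=( const SimpleDeviceBuffer& ) = delete;
  T* data() { return m_data; }
  std::size_t bytes() const { return m_size * sizeof( T ); }
private:
  std::size_t m_size;
  T* m_data;
};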
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc index 90e90b3aa9..497f35fa66 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -625,6 +626,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! 
+ // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -638,6 +643,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -661,12 +668,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +694,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +820,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +846,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +866,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +880,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +913,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1123,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum 
over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ 
b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
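The hunks above delete the per-process CudaRuntime.h symlink and add GpuAbstraction.h and GpuRuntime.h symlinks, while the MemoryBuffers.h and CPPProcess.cc hunks call gpuMemcpy and gpuMemcpyToSymbol instead of checkCuda( cudaMemcpy... ). The abstraction header itself is not part of this diff, so the following is only a minimal sketch of what it could contain, under the assumption that it maps generic gpu* names onto the CUDA or HIP runtime; every definition beyond the gpu* names visible in the hunks is an assumption.

// GpuAbstraction.h -- illustrative sketch only, NOT the header added by this patch;
// only the gpuMemcpy* names appear in the hunks above, everything else is assumed.
#pragma once
#include <cstdio>
#include <cstdlib>
#if defined __CUDACC__ // CUDA (NVidia GPU) build
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuMemcpyImpl cudaMemcpy
#define gpuMemcpyToSymbolImpl cudaMemcpyToSymbol
#elif defined __HIPCC__ // HIP (AMD GPU) build
#include <hip/hip_runtime.h>
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuMemcpyImpl hipMemcpy
#define gpuMemcpyToSymbolImpl hipMemcpyToSymbol
#endif
// Abort with a message if a GPU runtime call fails (simplified error handling)
#define checkGpu( code ) \
  do { \
    gpuError_t err_ = ( code ); \
    if( err_ != gpuSuccess ) { std::fprintf( stderr, "GPU error: %s\n", gpuGetErrorString( err_ ) ); std::abort(); } \
  } while( 0 )
#define gpuMemcpy( ... ) checkGpu( gpuMemcpyImpl( __VA_ARGS__ ) )
#define gpuMemcpyToSymbol( ... ) checkGpu( gpuMemcpyToSymbolImpl( __VA_ARGS__ ) )

With definitions along these lines, the call gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ) in the MemoryBuffers.h hunk would expand to the same checked cudaMemcpy as before on CUDA, and to the hipMemcpy equivalent on HIP.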
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
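Earlier in this check_sa.cc diff, the "00 CudaInit" step becomes "00 GpuInit" and instantiates a GpuRuntime instead of a CudaRuntime. Only the class name and the behaviour described in the added comment (cudaSetDevice(0) in the constructor, a cudaDeviceReset() booked in the destructor) are visible here, so the following is a minimal RAII sketch for the CUDA case only; the constructor signature and the debug printout are assumptions.

// GpuRuntime -- illustrative sketch only; the real GpuRuntime.h added by this
// patch is a symlink whose contents are not shown in the diff.
#include <cuda_runtime.h>
#include <iostream>
struct GpuRuntime
{
  explicit GpuRuntime( bool debug = false )
    : m_debug( debug )
  {
    if( m_debug ) std::cout << "GpuRuntime: selecting GPU device 0" << std::endl;
    cudaSetDevice( 0 ); // set up the device early, as the added comment describes
  }
  ~GpuRuntime()
  {
    if( m_debug ) std::cout << "GpuRuntime: resetting GPU device" << std::endl;
    cudaDeviceReset(); // "booked" for the end of main(), again per the added comment
  }
  bool m_debug;
};

In main() this would be used exactly as in the hunk above, i.e. GpuRuntime GpuRuntime( debug ); inside the #ifdef MGONGPUCPP_GPUIMPL block.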
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc index 76c9403933..61269d2eac 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -336,7 +337,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -393,7 +394,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -452,7 +453,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; 
#endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -515,8 +516,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -557,9 +558,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -596,7 +597,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -625,6 +626,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -638,6 +643,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -661,12 +668,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -687,7 +694,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -813,9 +820,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -839,7 +846,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -859,7 +866,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -873,9 +880,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -903,7 +913,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1113,7 +1123,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/GpuRuntime.h 
@@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/P1_Sigma_sm_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
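As throughout this patch, the check_sa.cc hunks below switch the preprocessor guards from __CUDACC__ to MGONGPUCPP_GPUIMPL, so that HIP builds take the same device-side branch as CUDA builds. The macro's definition is not part of this diff (it presumably lives in mgOnGpuConfig.h or the new GpuAbstraction.h); a minimal sketch of how it could be derived, purely as an assumption:

// Illustrative sketch only: derive a single "GPU implementation" switch from the
// compiler-specific macros, so device code paths are shared by CUDA and HIP builds.
#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1
#endif

Note that __CUDACC__ is still tested directly wherever the NVidia/AMD distinction matters, e.g. for the curand defaults, the nv_diagnostic pragmas and the "CUD:"/"HIP:" workflow tags in these hunks.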
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). 
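
The RandomNumberKernels.h hunk above keeps the reminder that mgOnGpuConfig.h must be included first because, in C++-only builds, it defines the GPU declaration specifiers away (the corresponding mgOnGpuConfig.h hunk appears further down in this diff). A minimal sketch of why that include order matters; sqr() here is just a made-up example function:

    // Sketch: with the empty fallback macros in scope, code decorated for
    // CUDA/HIP still compiles as plain C++.
    #ifndef MGONGPUCPP_GPUIMPL
    #define __global__
    #define __host__
    #define __device__
    #endif

    __host__ __device__ inline double sqr( const double x ) { return x * x; } // valid in CUDA, HIP and C++ builds
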
# If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! 
endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? 
+ HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) +export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) 
ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = 
-Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... 
###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o 
$(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += 
$(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. 
-// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
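
The fbridge.cc and runTest.cc hunks above switch from CudaRuntime::setUp()/tearDown() and checkCuda( cudaDeviceReset() ) to GpuRuntime::setUp()/tearDown() and checkGpu( gpuDeviceReset() ). The GpuRuntime.h header that provides these is not part of this excerpt; the sketch below only illustrates the kind of thin mapping such a layer can provide on top of the CUDA and HIP runtime APIs (the gpu* names are assumptions here, mirroring the calls visible in the diff):

    // Hypothetical sketch of a checkGpu/gpuDeviceReset mapping; the plugin's
    // real GpuRuntime.h is not shown in this diff.
    #include <stdexcept>
    #include <string>
    #ifdef __CUDACC__
    #define gpuError_t cudaError_t
    #define gpuSuccess cudaSuccess
    #define gpuGetErrorString cudaGetErrorString
    #define gpuDeviceReset cudaDeviceReset
    #elif defined __HIPCC__
    #include "hip/hip_runtime.h"
    #define gpuError_t hipError_t
    #define gpuSuccess hipSuccess
    #define gpuGetErrorString hipGetErrorString
    #define gpuDeviceReset hipDeviceReset
    #endif

    #ifdef MGONGPUCPP_GPUIMPL
    #define checkGpu( code )                                                        \
      {                                                                             \
        const gpuError_t err = ( code );                                            \
        if( err != gpuSuccess )                                                     \
          throw std::runtime_error( std::string( "GPU error: " ) + gpuGetErrorString( err ) ); \
      }
    #endif

    // Usage, as in the runTest.cc hunk above: checkGpu( gpuDeviceReset() );
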
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/gq_ttq.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h index cd4e6de668..45000c7246 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // 
Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc index c06dcbb252..8b92ea0bd6 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h index a6eb185434..a3615ec77a 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h +++ b/epochX/cudacpp/gq_ttq.sa/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -217,7 +217,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -236,7 +236,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. 
<> #endif @@ -263,7 +263,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/gq_ttq.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! 
-d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h index b247654dcf..475749ca7c 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
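The mgOnGpuConfig.h hunks above introduce MGONGPUCPP_GPUIMPL as the single "is this a GPU build" switch (defined under __CUDACC__ or __HIPCC__, undefined for C++). A minimal sketch, not part of the patch, of how downstream code is expected to branch on it; the helper backendName() is hypothetical:
#include "mgOnGpuConfig.h"
// Hypothetical helper, illustration only: one coarse branch on MGONGPUCPP_GPUIMPL
// ("any GPU"), with vendor-specific details still guarded by __CUDACC__ / __HIPCC__.
#ifdef MGONGPUCPP_GPUIMPL
inline const char* backendName()
{
#ifdef __CUDACC__
  return "cuda"; // nvcc build
#else
  return "hip"; // hipcc build (the only other way MGONGPUCPP_GPUIMPL can be defined)
#endif
}
#else
inline const char* backendName() { return "cpp"; } // CPU/SIMD build
#endif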
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/gq_ttq.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef 
__CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/gq_ttq.sa/src/rambo.h b/epochX/cudacpp/gq_ttq.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/rambo.h +++ b/epochX/cudacpp/gq_ttq.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/gq_ttq.sa/src/read_slha.cc b/epochX/cudacpp/gq_ttq.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/gq_ttq.sa/src/read_slha.cc +++ b/epochX/cudacpp/gq_ttq.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt index 800492306f..1054438636 100644 --- a/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt +++ b/epochX/cudacpp/heft_gg_h.sa/CODEGEN_cudacpp_heft_gg_h_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -62,6 +62,12 @@ set auto_convert_model T save options auto_convert_model save configuration file to /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amcnlo/input/mg5_configuration.txt import model heft +INFO: reload from .py file +INFO: load particles +INFO: load vertices +WARNING: coupling GC_13=-(complex(0,1)*GH) has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +WARNING: coupling GC_16=(complex(0,1)*Gphi)/8. has direct dependence in aS but has QCD order set to 0. Automatic computation of scale uncertainty can be wrong for such model.  +DEBUG: model prefixing takes 0.005749940872192383  INFO: Restrict model heft with file models/heft/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: s u w+ at order: QED=1  @@ -153,7 +159,7 @@ Generated helas calls for 1 subprocesses (1 diagrams) in 0.002 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVS3 routines -ALOHA: aloha creates 1 routines in 0.062 s +ALOHA: aloha creates 1 routines in 0.069 s VVS3 FileWriter for /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/./HelAmps_heft.h INFO: Created file HelAmps_heft.h in directory /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. @@ -165,7 +171,7 @@ INFO: Created files Parameters_heft.h and Parameters_heft.cc in directory INFO: /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. and /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_cudacpp_heft_gg_h/src/. 
quit -real 0m0.471s -user 0m0.367s -sys 0m0.052s -Code generation completed in 0 seconds +real 0m0.616s +user 0m0.387s +sys 0m0.049s +Code generation completed in 1 seconds diff --git a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT +++ b/epochX/cudacpp/heft_gg_h.sa/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. For the full diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= 
m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... 
this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
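The Bridge constructor above replaces std::filesystem::exists with a stat()-based lambda to work around the portability issue on LUMI (#803). A standalone sketch of the same check, assuming a POSIX stat() as in the patch; the surrounding main() is illustrative only:
#include <sys/stat.h>
#include <iostream>
#include <string>
// Illustration of the stat()-based existence check used in Bridge.h above (bypass of
// std::filesystem, #803); the fallback path logic mirrors the Bridge constructor.
inline bool fileExists( const std::string& fileName )
{
  struct stat buffer;
  return stat( fileName.c_str(), &buffer ) == 0;
}
int main()
{
  std::string paramCard = "../../Cards/param_card.dat";
  if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // try one directory higher, as in Bridge.h
  std::cout << "Using param card: " << paramCard << std::endl;
  return 0;
}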
#ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. 
Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. - -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! 
- /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! - } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
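The CudaRuntime.h file deleted above defined the checkCuda/assertCuda error-check pattern that the new GpuRuntime.h (added further below in this patch) re-creates as checkGpu/assertGpu. A self-contained, backend-free restatement of the pattern with a plain integer status code, for illustration only (the names checkStatus/assertStatus are hypothetical):
#include <cassert>
#include <cstdio>
// Hypothetical restatement of the check/assert pattern (illustration only):
// a non-zero status prints file/line and asserts, a zero status is a no-op.
#define checkStatus( code ) { assertStatus( code, __FILE__, __LINE__ ); }
inline void assertStatus( int code, const char* file, int line, bool abort = true )
{
  if( code != 0 )
  {
    printf( "ERROR! assertStatus: (%d) in %s:%d\n", code, file, line );
    if( abort ) assert( code == 0 );
  }
}
int main() { checkStatus( 0 ); return 0; } // a real call site would wrap e.g. a runtime API call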
-#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
+ +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
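GpuAbstraction.h above maps a single gpu* vocabulary onto either the CUDA or the HIP runtime API. A minimal usage sketch, illustration only: the kernel fillOnes and the grid sizes are hypothetical, and mgOnGpuConfig.h is included so that MGONGPUCPP_GPUIMPL and checkGpu are available, as in the real sources.
#include "mgOnGpuConfig.h"
#include "GpuAbstraction.h"
#include "GpuRuntime.h" // for checkGpu (defined when MGONGPUCPP_GPUIMPL is set)
#ifdef MGONGPUCPP_GPUIMPL
// Hypothetical kernel: each thread writes one element.
__global__ void fillOnes( double* out ) { out[blockDim.x * blockIdx.x + threadIdx.x] = 1.; }
void sketch()
{
  const int gpublocks = 2, gputhreads = 32;
  double* devBuf = nullptr;
  gpuMalloc( &devBuf, gpublocks * gputhreads * sizeof( double ) ); // cudaMalloc or hipMalloc
  gpuLaunchKernel( fillOnes, gpublocks, gputhreads, devBuf );      // fillOnes<<<gpublocks, gputhreads>>>( devBuf )
  checkGpu( gpuDeviceSynchronize() );                              // cudaDeviceSynchronize or hipDeviceSynchronize
  gpuFree( devBuf );                                               // cudaFree or hipFree
}
#endif
The same translation unit builds with nvcc or hipcc without any #ifdef at the call sites: only the macro definitions differ.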
+ +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! + */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! 
+ } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. /// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. 
Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
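MatrixElementKernels.cc above launches kernels through gpuLaunchKernel/gpuLaunchKernelSharedMem and checks them with checkGpu( gpuPeekAtLastError() ), while GpuRuntime.h provides the RAII device setup/reset. A hypothetical main() sketching how the two pieces are meant to fit together (illustration only, not taken from the patch):
#include "mgOnGpuConfig.h"
#include "GpuRuntime.h"
#include <iostream>
int main()
{
#ifdef MGONGPUCPP_GPUIMPL
  // Constructor calls gpuSetDevice(0); destructor calls gpuDeviceReset() at scope exit.
  mg5amcGpu::GpuRuntime gpuRuntime( /*debug=*/true );
  // ... allocate buffers, then launch kernels via gpuLaunchKernel(...) and check them with
  // checkGpu( gpuPeekAtLastError() ) / checkGpu( gpuDeviceSynchronize() ), as above ...
#endif
  std::cout << "done" << std::endl;
  return 0;
}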
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h index d65c9d6e04..85c3c9ed1c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h 
+++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h index 8109470148..78004e66cc 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_heft.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of events 
template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
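The gpuMallocHost/gpuFreeHost/gpuMalloc/gpuFree calls that replace the explicit checkCuda( cuda... ) wrappers in MemoryBuffers.h above, and the gpuMemcpy/gpuMemcpyToSymbol calls used further down, are presumably provided by the new GpuRuntime.h/GpuAbstraction.h headers, which this diff does not show. A sketch of the CUDA branch of such a mapping, with the error check folded into the wrapper as checkCuda used to do at every call site, could look like this (a HIP branch would map the same names onto hipMalloc, hipHostMalloc, hipMemcpy, and so on):

    // Sketch only: assumed shape of the abstraction layer, CUDA branch.
    #include <cuda_runtime.h>
    #include <stdexcept>

    inline void checkGpu( cudaError_t code )
    {
      if( code != cudaSuccess ) throw std::runtime_error( cudaGetErrorString( code ) );
    }

    #define gpuMalloc( ptr, bytes ) checkGpu( cudaMalloc( ptr, bytes ) )
    #define gpuMallocHost( ptr, bytes ) checkGpu( cudaMallocHost( ptr, bytes ) )
    #define gpuFree( ptr ) checkGpu( cudaFree( ptr ) )
    #define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) )
    #define gpuMemcpy( dst, src, bytes, kind ) checkGpu( cudaMemcpy( dst, src, bytes, kind ) )
    #define gpuMemcpyToSymbol( sym, src, bytes ) checkGpu( cudaMemcpyToSymbol( sym, src, bytes ) )
    #define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
    #define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
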
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc index 526bd7d296..624791e8b3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_heft.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > h HIG<=1 HIW<=1 WEIGHTED<=2 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu //__device__ const fptype* cIPD = nullptr; // unused as nparam=0 __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //__device__ __constant__ fptype* cIPD = nullptr; // unused as nparam=0 __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace 
mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -268,7 +269,7 @@ namespace mg5amcCpu // [NB do keep 'static' for these constexpr arrays, see issue #283] static constexpr fptype2 cf[ncolor][ncolor] = { { 2 } }; // 2-D array[1][1] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -325,7 +326,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -384,7 +385,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -419,8 +420,8 @@ namespace mg5amcCpu { -1, 1, 0 }, { 1, -1, 0 }, { 1, 1, 0 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -459,9 +460,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory //const fptype tIPD[0] = { ... }; // nparam=0 //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - //checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ) ); // nparam=0 - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + //gpuMemcpyToSymbol( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else //memcpy( cIPD, tIPD, 0 * sizeof( fptype ) ); // nparam=0 //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -495,7 +496,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -524,6 +525,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." 
<< __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -537,6 +542,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -560,12 +567,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -586,7 +593,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -712,9 +719,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -738,7 +745,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -758,7 +765,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -772,9 +779,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -802,7 +812,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event 
color selection (#402) @@ -1012,7 +1022,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h index dbc5aa0e4e..e1caef360b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git 
a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/P1_Sigma_heft_gg_h/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
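The "00 GpuInit" step earlier in check_sa.cc replaces the CudaRuntime helper with a GpuRuntime object; per the updated comments, on CUDA its constructor invokes cudaSetDevice(0) and its destructor books a cudaDeviceReset() call. A minimal RAII sketch of that behaviour follows (an illustration only, using a hypothetical GpuRuntimeSketch name and assuming the HIP side would rely on the analogous hipSetDevice/hipDeviceReset calls):

    // Sketch only: RAII wrapper with the behaviour described in the check_sa.cc comments.
    #include <cuda_runtime.h>
    #include <iostream>

    struct GpuRuntimeSketch // hypothetical name, not the real GpuRuntime.h class
    {
      GpuRuntimeSketch( bool debug = false )
        : m_debug( debug )
      {
        if( m_debug ) std::cout << "GpuRuntime sketch: setting device 0" << std::endl;
        cudaSetDevice( 0 ); // hipSetDevice( 0 ) on AMD GPUs
      }
      ~GpuRuntimeSketch()
      {
        if( m_debug ) std::cout << "GpuRuntime sketch: resetting device" << std::endl;
        cudaDeviceReset(); // hipDeviceReset() on AMD GPUs
      }
      const bool m_debug;
    };
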
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! 
(AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) 
+export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += 
-DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring 
hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 
endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) 
$(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
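The fbridge.cc hunks above pair GpuRuntime::setUp() in fbridgecreate_ with GpuRuntime::tearDown() in fbridgedelete_, and both calls are compiled only when MGONGPUCPP_GPUIMPL is defined. A minimal sketch of that pattern as a hypothetical RAII wrapper (GpuRuntimeGuard is illustration only and not part of this patch; the GpuRuntime.h header name and the setUp/tearDown calls are taken from the hunks above):

// Hypothetical RAII equivalent of the explicit calls in fbridgecreate_/fbridgedelete_
#include "GpuRuntime.h" // declares GpuRuntime::setUp() and GpuRuntime::tearDown() (see the fbridge.cc hunk above)
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu; // the GPU namespace, as in the fbridge.cc hunk above
#endif
struct GpuRuntimeGuard
{
  GpuRuntimeGuard()
  {
#ifdef MGONGPUCPP_GPUIMPL
    GpuRuntime::setUp(); // same call as in fbridgecreate_ (CUDA or HIP)
#endif
  }
  ~GpuRuntimeGuard()
  {
#ifdef MGONGPUCPP_GPUIMPL
    GpuRuntime::tearDown(); // same call as in fbridgedelete_
#endif
  }
};

On a CPU-only build both branches compile to empty functions, which matches the behaviour of the guarded calls in the hunk.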
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc index a1c3cdc238..688cb8167b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/heft_gg_h.sa/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
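The runTest.cc hunk above replaces checkCuda( cudaDeviceReset() ) with checkGpu( gpuDeviceReset() ). The gpu*-prefixed names are assumed to be thin aliases supplied by the GPU abstraction layer (GpuAbstraction.h is mentioned later in this diff but is not included in this excerpt); a hedged sketch of how such aliases could map onto the two backends, using only standard CUDA and HIP runtime symbols:

// Assumption: illustrative aliases only, not the actual GpuAbstraction.h content
#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#include "hip/hip_runtime.h"
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuDeviceReset hipDeviceReset
#endif
// checkGpu is then expected to wrap the returned gpuError_t and fail on anything
// other than gpuSuccess, exactly as the checkCuda helper it replaces did for cudaError_t.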
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h index eae9ff5242..dbff117235 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/HelAmps_heft.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc index e5442756b1..d3d6058b46 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. 
-// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h index 790485fee0..c2be5bba97 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/Parameters_heft.h @@ -28,7 +28,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -94,7 +94,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -230,7 +230,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -247,7 +247,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -298,7 +298,7 @@ namespace mg5amcCpu // End non-SM (e.g. EFT) implementation - special handling of vectors of floats (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk index 0bd815c9b3..fb8da8830b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk +++ b/epochX/cudacpp/heft_gg_h.sa/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_heft.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_heft_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h index b247654dcf..475749ca7c 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #undef MGONGPU_SUPPORTS_MULTICHANNEL +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDA__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
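The mgOnGpuConfig.h hunks above introduce MGONGPUCPP_GPUIMPL as the single "is this a GPU build?" switch: it is defined for both __CUDACC__ and __HIPCC__, while the compiler-specific macros remain in use for details that differ between the two backends. A minimal sketch of the intended usage pattern, assembled from the guards that recur throughout this diff (it only restates the convention, no new behaviour is implied):

#include "mgOnGpuConfig.h" // defines MGONGPUCPP_GPUIMPL for CUDA and HIP builds (hunk above)
// Namespace selection, as in the generated sources above (see #318 and #725)
#ifdef MGONGPUCPP_GPUIMPL
using namespace mg5amcGpu; // any GPU build: CUDA or HIP
#else
using namespace mg5amcCpu; // CPU/SIMD build
#endif
// Backend-specific details still test the compiler-defined macros directly
#ifdef __CUDACC__
// CUDA-only code (e.g. the thrust/cuComplex complex types, nv_diag pragmas)
#elif defined __HIPCC__
// HIP-only code (e.g. the cxsmpl complex type, hip/hip_runtime.h)
#endif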
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDAC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex 
(this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDAC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see 
#318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/rambo.h +++ b/epochX/cudacpp/heft_gg_h.sa/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/heft_gg_h.sa/src/read_slha.cc b/epochX/cudacpp/heft_gg_h.sa/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/heft_gg_h.sa/src/read_slha.cc +++ b/epochX/cudacpp/heft_gg_h.sa/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt index ff161c336f..e01d29e02f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt +++ b/epochX/cudacpp/pp_tt012j.mad/CODEGEN_mad_pp_tt012j_log.txt @@ -52,7 +52,7 @@ Note that you can still compile and run aMC@NLO with the built-in PDFs Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt import /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j.mg The import format was not given, so we guess it as command set stdout_level DEBUG @@ -61,7 +61,7 @@ set zerowidth_tchannel F define j = p INFO: load particles INFO: load vertices -DEBUG: model prefixing takes 0.005424976348876953  +DEBUG: model prefixing takes 0.005494594573974609  INFO: Restrict model sm with file models/sm/restrict_default.dat . DEBUG: Simplifying conditional expressions  DEBUG: remove interactions: u s w+ at order: QED=1  @@ -172,7 +172,7 @@ INFO: Process u~ u > t t~ added to mirror process u u~ > t t~ INFO: Process c~ c > t t~ added to mirror process c c~ > t t~ INFO: Process d~ d > t t~ added to mirror process d d~ > t t~ INFO: Process s~ s > t t~ added to mirror process s s~ > t t~ -5 processes with 7 diagrams generated in 0.030 s +5 processes with 7 diagrams generated in 0.029 s Total: 5 processes with 7 diagrams add process p p > t t~ j @1 INFO: Checking for minimal orders which gives processes. @@ -212,7 +212,7 @@ INFO: Process d~ g > t t~ d~ added to mirror process g d~ > t t~ d~ INFO: Process d~ d > t t~ g added to mirror process d d~ > t t~ g INFO: Process s~ g > t t~ s~ added to mirror process g s~ > t t~ s~ INFO: Process s~ s > t t~ g added to mirror process s s~ > t t~ g -13 processes with 76 diagrams generated in 0.139 s +13 processes with 76 diagrams generated in 0.136 s Total: 18 processes with 83 diagrams add process p p > t t~ j j @2 INFO: Checking for minimal orders which gives processes. @@ -378,7 +378,7 @@ INFO: Process s~ u~ > t t~ u~ s~ added to mirror process u~ s~ > t t~ u~ s~ INFO: Process s~ c~ > t t~ c~ s~ added to mirror process c~ s~ > t t~ c~ s~ INFO: Process s~ d~ > t t~ d~ s~ added to mirror process d~ s~ > t t~ d~ s~ INFO: Crossed process found for s~ s~ > t t~ s~ s~, reuse diagrams. 
-65 processes with 1119 diagrams generated in 1.876 s +65 processes with 1119 diagrams generated in 1.826 s Total: 83 processes with 1202 diagrams output madevent ../TMPOUT/CODEGEN_mad_pp_tt012j --hel_recycling=False --vector_size=32 --me_exporter=standalone_cudacpp Load PLUGIN.CUDACPP_OUTPUT @@ -497,7 +497,7 @@ INFO: Combined process d d~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED INFO: Combined process s s~ > t t~ WEIGHTED<=2 with process u u~ > t t~ WEIGHTED<=2 INFO: Creating files in directory P2_gg_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -514,7 +514,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxgg INFO: Creating files in directory P2_gg_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -531,7 +531,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gg_ttxuux INFO: Creating files in directory P2_gu_ttxgu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -548,7 +548,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ g u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gu_ttxgu INFO: Creating files in directory P2_gux_ttxgux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -565,7 +565,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ g u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group gux_ttxgux INFO: Creating files in directory P2_uux_ttxgg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -582,7 +582,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g g WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxgg INFO: Creating files in directory P1_gg_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -599,7 +599,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gg_ttxg INFO: Creating files in directory P2_uu_ttxuu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -616,7 +616,7 @@ INFO: Generating Feynman diagrams for Process: u u > t t~ u u WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uu_ttxuu INFO: Creating files in directory P2_uux_ttxuux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -633,7 +633,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ u u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxuux INFO: Creating files in directory P2_uxux_ttxuxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -650,7 +650,7 @@ INFO: Generating Feynman diagrams for Process: u~ u~ > t t~ u~ u~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxux_ttxuxux INFO: Creating files in directory P2_uc_ttxuc DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -667,7 +667,7 @@ INFO: Generating Feynman diagrams for Process: u c > t t~ u c WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uc_ttxuc INFO: Creating files in directory P2_uux_ttxccx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -684,7 +684,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ c c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uux_ttxccx INFO: Creating files in directory P2_ucx_ttxucx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -701,7 +701,7 @@ INFO: Generating Feynman diagrams for Process: u c~ > t t~ u c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group ucx_ttxucx INFO: Creating files in directory P2_uxcx_ttxuxcx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . 
FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -718,7 +718,7 @@ INFO: Generating Feynman diagrams for Process: u~ c~ > t t~ u~ c~ WEIGHTED<=4 @2 INFO: Finding symmetric diagrams for subprocess group uxcx_ttxuxcx INFO: Creating files in directory P1_gu_ttxu DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -735,7 +735,7 @@ INFO: Generating Feynman diagrams for Process: g u > t t~ u WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gu_ttxu INFO: Creating files in directory P1_gux_ttxux DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -752,7 +752,7 @@ INFO: Generating Feynman diagrams for Process: g u~ > t t~ u~ WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group gux_ttxux INFO: Creating files in directory P1_uux_ttxg DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -769,7 +769,7 @@ INFO: Generating Feynman diagrams for Process: u u~ > t t~ g WEIGHTED<=3 @1 INFO: Finding symmetric diagrams for subprocess group uux_ttxg INFO: Creating files in directory P0_gg_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -786,7 +786,7 @@ INFO: Generating Feynman diagrams for Process: g g > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group gg_ttx INFO: Creating files in directory P0_uux_ttx DEBUG: kwargs[prefix] = 0 [model_handling.py at line 1058]  -DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  +DEBUG: process_exporter_cpp =  [export_v4.py at line 6262]  INFO: Creating files in directory . FileWriter for ././CPPProcess.h FileWriter for ././CPPProcess.cc @@ -801,15 +801,15 @@ INFO: Created files CPPProcess.h and CPPProcess.cc in directory ./. 
DEBUG: vector, subproc_group,self.opt['vector_size'] =  32 True 32 [export_v4.py at line 1872]  INFO: Generating Feynman diagrams for Process: u u~ > t t~ WEIGHTED<=2 INFO: Finding symmetric diagrams for subprocess group uux_ttx -Generated helas calls for 18 subprocesses (372 diagrams) in 1.312 s -Wrote files for 810 helas calls in 3.308 s +Generated helas calls for 18 subprocesses (372 diagrams) in 1.285 s +Wrote files for 810 helas calls in 3.231 s ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 5 routines in 0.342 s +ALOHA: aloha creates 5 routines in 0.334 s DEBUG: Entering PLUGIN_ProcessExporter.convert_model (create the model) [output.py at line 202]  ALOHA: aloha starts to compute helicity amplitudes ALOHA: aloha creates VVV1 routines @@ -817,7 +817,7 @@ ALOHA: aloha creates FFV1 routines ALOHA: aloha creates VVVV1 routines ALOHA: aloha creates VVVV3 routines ALOHA: aloha creates VVVV4 routines -ALOHA: aloha creates 10 routines in 0.321 s +ALOHA: aloha creates 10 routines in 0.314 s VVV1 VVV1 FFV1 @@ -1028,9 +1028,9 @@ Type "launch" to generate events from this process, or see Run "open index.html" to see more information about this process. quit -real 0m9.073s -user 0m8.514s -sys 0m0.464s +real 0m8.840s +user 0m8.307s +sys 0m0.499s Code generation completed in 9 seconds ************************************************************ * * @@ -1057,7 +1057,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards run quit INFO: @@ -1087,7 +1087,7 @@ INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/mg5amc INFO: load configuration from /data/avalassi/GPU2023/madgraph4gpuX/MG5aMC/TMPOUT/CODEGEN_mad_pp_tt012j/Cards/me5_configuration.txt Using default text editor "vi". Set another one in ./input/mg5_configuration.txt Using default eps viewer "evince". Set another one in ./input/mg5_configuration.txt -No valid web browser found. Please set in ./input/mg5_configuration.txt +Using default web browser "firefox". Set another one in ./input/mg5_configuration.txt treatcards param quit INFO: diff --git a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT index a134b5fef9..9036d9260a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT +++ b/epochX/cudacpp/pp_tt012j.mad/COPYRIGHT @@ -15,6 +15,7 @@ The full development team currently includes the following authors : Stephan Hageboeck (CERN) Olivier Mattelaer (Universite Catholique de Louvain, original author) Stefan Roiser (CERN, original author) + Jorgen Teig (CERN) Andrea Valassi (CERN, original author) Zenny Wettersten (CERN) See https://github.com/madgraph5/madgraph4gpu for more details. 
For the full diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h index bf8b5e024d..f9ed70dfde 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/Bridge.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Nov 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGE_H #define BRIDGE_H 1 @@ -14,16 +14,23 @@ #include "MemoryAccessMomenta.h" // for MemoryAccessMomenta::neppM #include "MemoryBuffers.h" // for HostBufferMomenta, DeviceBufferMomenta etc +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif + +#include // bypass std::filesystem #803 + #include #include #include #include -#include #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +90,7 @@ namespace mg5amcCpu Bridge& operator=( const Bridge& ) = delete; Bridge& operator=( Bridge&& ) = delete; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL /** * Set the gpublocks and gputhreads for the gpusequence - throws if evnt != gpublocks*gputhreads * (this is needed for BridgeKernel tests rather than for actual production use in Fortran) @@ -150,7 +157,7 @@ namespace mg5amcCpu unsigned int m_nevt; // number of events int m_nGoodHel; // the number of good helicities (-1 initially when they have not yet been calculated) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL int m_gputhreads; // number of gpu threads (default set from number of events, can be modified) int m_gpublocks; // number of gpu blocks (default set from number of events, can be modified) DeviceBuffer m_devMomentaF; @@ -187,12 +194,12 @@ namespace mg5amcCpu // Forward declare transposition methods // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL template void hst_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ); @@ -209,7 +216,7 @@ namespace mg5amcCpu Bridge::Bridge( unsigned int nevtF, unsigned int nparF, unsigned int np4F ) : m_nevt( nevtF ) , m_nGoodHel( -1 ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL , m_gputhreads( 256 ) // default number of gpu threads , m_gpublocks( m_nevt / m_gputhreads ) // this ensures m_nevt <= m_gpublocks*m_gputhreads , m_devMomentaF( m_nevt ) @@ -233,7 +240,7 @@ namespace mg5amcCpu { if( nparF != CPPProcess::npar ) throw std::runtime_error( "Bridge constructor: npar mismatch" ); if( np4F != CPPProcess::np4 ) throw std::runtime_error( "Bridge constructor: np4 mismatch" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( ( m_nevt < s_gputhreadsmin ) || ( m_nevt % s_gputhreadsmin != 0 ) ) throw std::runtime_error( "Bridge constructor: nevt should be a multiple of " + std::to_string( s_gputhreadsmin ) ); while( m_nevt != m_gpublocks * m_gputhreads ) @@ -249,20 +256,28 @@ namespace mg5amcCpu #else std::cout << "WARNING! 
Instantiate host Bridge (nevt=" << m_nevt << ")" << std::endl; m_pmek.reset( new MatrixElementKernelHost( m_hstMomentaC, m_hstGs, m_hstRndHel, m_hstRndCol, m_hstMEs, m_hstSelHel, m_hstSelCol, m_nevt ) ); -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL // Create a process object, read param card and set parameters // FIXME: the process instance can happily go out of scope because it is only needed to read parameters? // FIXME: the CPPProcess should really be a singleton? what if fbridgecreate is called from several Fortran threads? CPPProcess process( /*verbose=*/false ); std::string paramCard = "../../Cards/param_card.dat"; - if( !std::filesystem::exists( paramCard ) ) - { - paramCard = "../" + paramCard; - } + /* +#ifdef __HIPCC__ + if( !std::experimental::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#else + if( !std::filesystem::exists( paramCard ) ) paramCard = "../" + paramCard; +#endif + */ + //struct stat dummybuffer; // bypass std::filesystem #803 + //if( !( stat( paramCard.c_str(), &dummyBuffer ) == 0 ) ) paramCard = "../" + paramCard; // + auto fileExists = []( std::string& fileName ) + { struct stat buffer; return stat( fileName.c_str(), &buffer ) == 0; }; + if( !fileExists( paramCard ) ) paramCard = "../" + paramCard; // bypass std::filesystem #803 process.initProc( paramCard ); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::set_gpugrid( const int gpublocks, const int gputhreads ) { @@ -276,7 +291,7 @@ namespace mg5amcCpu } #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void Bridge::gpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -291,14 +306,14 @@ namespace mg5amcCpu constexpr int neppM = MemoryAccessMomenta::neppM; if constexpr( neppM == 1 && std::is_same_v ) { - checkCuda( cudaMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaC.data(), momenta, m_devMomentaC.bytes(), gpuMemcpyHostToDevice ); } else { - checkCuda( cudaMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( m_devMomentaF.data(), momenta, m_devMomentaF.bytes(), gpuMemcpyHostToDevice ); const int thrPerEvt = CPPProcess::npar * CPPProcess::np4; // AV: transpose alg does 1 element per thread (NOT 1 event per thread) //const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower - dev_transposeMomentaF2C<<>>( m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); + gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt ); } if constexpr( std::is_same_v ) { @@ -341,7 +356,7 @@ namespace mg5amcCpu } #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL template void Bridge::cpu_sequence( const FORTRANFPTYPE* momenta, const FORTRANFPTYPE* gs, @@ -396,7 +411,7 @@ namespace mg5amcCpu // - C++ array: momenta[npagM][npar][np4][neppM] with nevt=npagM*neppM (AOSOA) // -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template __global__ void dev_transposeMomentaF2C( const Tin* in, Tout* out, const unsigned int nevt ) { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc index d58066c9c1..eaf4037a24 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.cc @@ -1,17 +1,18 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
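The Bridge.h hunk above replaces the std::filesystem::exists call with a small stat()-based lambda to bypass std::filesystem (issue #803) on platforms where it is problematic, such as toolchains that only ship std::experimental::filesystem. A minimal standalone sketch of that check, outside the patch, with the card path shown above reused purely as an example:

#include <string>
#include <sys/stat.h>

// Sketch only (not part of the patch): POSIX stat()-based existence check, as used in Bridge.h.
inline bool fileExists( const std::string& fileName )
{
  struct stat buffer;
  return stat( fileName.c_str(), &buffer ) == 0; // 0 means the path could be stat'ed
}

// Example: fall back to the parent directory if the card is not found locally.
// std::string paramCard = "../../Cards/param_card.dat";
// if( !fileExists( paramCard ) ) paramCard = "../" + paramCard;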
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "BridgeKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMomenta.h" #include //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -45,7 +46,7 @@ namespace mg5amcCpu //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -96,7 +97,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h index 15eb4bff4d..3efef8ce97 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/BridgeKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef BRIDGEKERNELS_H #define BRIDGEKERNELS_H 1 @@ -12,7 +12,7 @@ #include "MatrixElementKernels.h" #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -49,7 +49,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a CPU host class BridgeKernelHost final : public BridgeKernelBase { @@ -89,7 +89,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A Bridge wrapper class encapsulating matrix element calculations on a GPU device class BridgeKernelDevice : public BridgeKernelBase { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc index 985b39f576..010bc4cbd0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CommonRandomNumberKernel.cc @@ -1,15 +1,16 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
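A recurring change across these hunks is the replacement of the compiler-defined __CUDACC__ guard by the build-level MGONGPUCPP_GPUIMPL flag, so that the same sources can select the GPU (CUDA or HIP) or CPU namespace. A minimal sketch of the idiom, under the assumption that the build system defines MGONGPUCPP_GPUIMPL only for GPU builds:

// Sketch of the namespace-selection idiom used throughout this patch.
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu // GPU implementation (CUDA or HIP)
#else
namespace mg5amcCpu // CPU implementation
#endif
{
  // Types and kernels are defined once per backend inside this namespace
  // (see #318 and #725 for why the two namespaces must stay separate).
  struct ExampleType {}; // hypothetical placeholder
}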
#include "CommonRandomNumbers.h" +#include "GpuAbstraction.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc index 0b355a3c8d..c15b39844d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.cc @@ -1,10 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "CrossSectionKernels.h" +#include "GpuAbstraction.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessWeights.h" #include "MemoryBuffers.h" @@ -77,7 +78,7 @@ debug_me_is_abnormal( const fptype& me, size_t ievtALL ) //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -185,7 +186,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h index 7933ca4bbf..4d9659e04e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CrossSectionKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef CROSSSECTIONKERNELS_H #define CROSSSECTIONKERNELS_H 1 @@ -13,7 +13,7 @@ //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -96,7 +96,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating the calculation of event statistics on a GPU device class CrossSectionKernelDevice : public CrossSectionKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h deleted file mode 100644 index 64ce52f4b3..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CudaRuntime.h +++ /dev/null @@ -1,85 +0,0 @@ -// Copyright (C) 2020-2023 CERN and UCLouvain. -// Licensed under the GNU Lesser General Public License (version 3 or later). -// Created by: S. Roiser (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
- -#ifndef MG5AMC_CUDARUNTIME_H -#define MG5AMC_CUDARUNTIME_H 1 - -// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API -// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api - -#include -#include - -//-------------------------------------------------------------------------- - -// See https://stackoverflow.com/a/14038590 -#ifdef __CUDACC__ /* clang-format off */ -#define checkCuda( code ) { assertCuda( code, __FILE__, __LINE__ ); } -inline void assertCuda( cudaError_t code, const char* file, int line, bool abort = true ) -{ - if( code != cudaSuccess ) - { - printf( "ERROR! assertCuda: '%s' (%d) in %s:%d\n", cudaGetErrorString( code ), code, file, line ); - if( abort ) assert( code == cudaSuccess ); - } -} -#endif /* clang-format on */ - -//-------------------------------------------------------------------------- - -#ifdef __CUDACC__ -namespace mg5amcGpu -{ - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** - struct CudaRuntime final - { - CudaRuntime( const bool debug = true ) - : m_debug( debug ) { setUp( m_debug ); } - ~CudaRuntime() { tearDown( m_debug ); } - CudaRuntime( const CudaRuntime& ) = delete; - CudaRuntime( CudaRuntime&& ) = delete; - CudaRuntime& operator=( const CudaRuntime& ) = delete; - CudaRuntime& operator=( CudaRuntime&& ) = delete; - bool m_debug; - - // Set up CUDA application - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization - static void setUp( const bool debug = true ) - { - // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization - // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! - /* - // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] - // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization - // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ - if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; - checkCuda( cudaFree( 0 ) ); // SLOW! - */ - // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either - // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) - if( debug ) std::cout << "__CudaRuntime: calling cudaSetDevice(0)" << std::endl; - checkCuda( cudaSetDevice( 0 ) ); // SLOW! 
- } - - // Tear down CUDA application (call cudaDeviceReset) - // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** - // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck - // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking - static void tearDown( const bool debug = true ) - { - if( debug ) std::cout << "__CudaRuntime: calling cudaDeviceReset()" << std::endl; - checkCuda( cudaDeviceReset() ); - } - }; - -} -#endif - -//-------------------------------------------------------------------------- - -#endif // MG5AMC_CUDARUNTIME_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc index eb56333b03..08a16f6f2c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/CurandRandomNumberKernel.cc @@ -1,9 +1,9 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryBuffers.h" #include "RandomNumberKernels.h" @@ -22,7 +22,7 @@ inline void assertCurand( curandStatus_t code, const char *file, int line, bool } #endif /* clang-format on */ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -36,7 +36,7 @@ namespace mg5amcCpu { if( m_isOnDevice ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !m_rnarray.isOnDevice() ) throw std::runtime_error( "CurandRandomNumberKernel on device with a host random number array" ); #else @@ -114,7 +114,7 @@ namespace mg5amcCpu /* printf( "\nCurandRandomNumberKernel::generateRnarray size = %d\n", (int)m_rnarray.size() ); fptype* data = m_rnarray.data(); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) { data = new fptype[m_rnarray.size()](); @@ -123,7 +123,7 @@ namespace mg5amcCpu #endif for( int i = 0; i < ( (int)m_rnarray.size() / 4 ); i++ ) printf( "[%4d] %f %f %f %f\n", i * 4, data[i * 4], data[i * 4 + 2], data[i * 4 + 2], data[i * 4 + 3] ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( m_rnarray.isOnDevice() ) delete[] data; #endif */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h index 48b51e0a49..b425a5bade 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/EventStatistics.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
#ifndef EventStatistics_H #define EventStatistics_H 1 @@ -16,7 +16,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h new file mode 100644 index 0000000000..9c467b1e04 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuAbstraction.h @@ -0,0 +1,69 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jul 2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPUABSTRACTION_H +#define MG5AMC_GPUABSTRACTION_H 1 + +#include + +//-------------------------------------------------------------------------- + +#ifdef __CUDACC__ + +#define gpuError_t cudaError_t +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorString cudaGetErrorString +#define gpuSuccess cudaSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( cudaMallocHost( ptr, size ) ) +#define gpuMalloc( ptr, size ) checkGpu( cudaMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( cudaMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice +#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( cudaMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( cudaFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( cudaFreeHost( ptr ) ) + +#define gpuSetDevice cudaSetDevice +#define gpuDeviceSynchronize cudaDeviceSynchronize +#define gpuDeviceReset cudaDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... ) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#elif defined __HIPCC__ + +#define gpuError_t hipError_t +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorString hipGetErrorString +#define gpuSuccess hipSuccess + +#define gpuMallocHost( ptr, size ) checkGpu( hipHostMalloc( ptr, size ) ) // HostMalloc better +#define gpuMalloc( ptr, size ) checkGpu( hipMalloc( ptr, size ) ) + +#define gpuMemcpy( dstData, srcData, srcBytes, func ) checkGpu( hipMemcpy( dstData, srcData, srcBytes, func ) ) +#define gpuMemcpyHostToDevice hipMemcpyHostToDevice +#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost +#define gpuMemcpyToSymbol( type1, type2, size ) checkGpu( hipMemcpyToSymbol( type1, type2, size ) ) + +#define gpuFree( ptr ) checkGpu( hipFree( ptr ) ) +#define gpuFreeHost( ptr ) checkGpu( hipHostFree( ptr ) ) + +#define gpuSetDevice hipSetDevice +#define gpuDeviceSynchronize hipDeviceSynchronize +#define gpuDeviceReset hipDeviceReset + +#define gpuLaunchKernel( kernel, blocks, threads, ... ) kernel<<>>( __VA_ARGS__ ) +#define gpuLaunchKernelSharedMem( kernel, blocks, threads, sharedMem, ... 
) kernel<<>>( __VA_ARGS__ ) + +//-------------------------------------------------------------------------- + +#endif + +#endif // MG5AMC_GPUABSTRACTION_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h new file mode 100644 index 0000000000..93579ef08b --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/GpuRuntime.h @@ -0,0 +1,85 @@ +// Copyright (C) 2020-2023 CERN and UCLouvain. +// Licensed under the GNU Lesser General Public License (version 3 or later). +// Created by: J. Teig (Jun 2023, based on earlier work by S. Roiser) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. + +#ifndef MG5AMC_GPURUNTIME_H +#define MG5AMC_GPURUNTIME_H 1 + +// MG5AMC on GPU uses the CUDA runtime API, not the lower level CUDA driver API +// See https://docs.nvidia.com/cuda/cuda-runtime-api/driver-vs-runtime-api.html#driver-vs-runtime-api + +#include "GpuAbstraction.h" + +#include + +//-------------------------------------------------------------------------- + +// See https://stackoverflow.com/a/14038590 +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ +#define checkGpu( code ) { assertGpu( code, __FILE__, __LINE__ ); } +inline void assertGpu( gpuError_t code, const char* file, int line, bool abort = true ) +{ + if( code != gpuSuccess ) + { + printf( "ERROR! assertGpu: '%s' (%d) in %s:%d\n", gpuGetErrorString( code ), code, file, line ); + if( abort ) assert( code == gpuSuccess ); + } +} +#endif /* clang-format on */ + +//-------------------------------------------------------------------------- + +#ifdef MGONGPUCPP_GPUIMPL +namespace mg5amcGpu +{ + // Instantiate a GpuRuntime at the beginnining of the application's main to + // invoke gpuSetDevice(0) in the constructor and book a gpuDeviceReset() call in the destructor + // *** FIXME! This will all need to be designed differently when going to multi-GPU nodes! *** + struct GpuRuntime final + { + GpuRuntime( const bool debug = true ) + : m_debug( debug ) { setUp( m_debug ); } + ~GpuRuntime() { tearDown( m_debug ); } + GpuRuntime( const GpuRuntime& ) = delete; + GpuRuntime( GpuRuntime&& ) = delete; + GpuRuntime& operator=( const GpuRuntime& ) = delete; + GpuRuntime& operator=( GpuRuntime&& ) = delete; + bool m_debug; + + // Set up CUDA application + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaSetDevice on startup is useful to properly book-keep the time spent in CUDA initialization + static void setUp( const bool debug = true ) + { + // ** NB: it is useful to call cudaSetDevice, or cudaFree, to properly book-keep the time spent in CUDA initialization + // ** NB: otherwise, the first CUDA operation (eg a cudaMemcpyToSymbol in CPPProcess ctor) appears to take much longer! + /* + // [We initially added cudaFree(0) to "ease profile analysis" only because it shows up as a big recognizable block!] + // No explicit initialization is needed: https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#initialization + // It is not clear what cudaFree(0) does at all: https://stackoverflow.com/questions/69967813/ + if ( debug ) std::cout << "__CudaRuntime: calling cudaFree(0)" << std::endl; + checkCuda( cudaFree( 0 ) ); // SLOW! 
+ */ + // Replace cudaFree(0) by cudaSetDevice(0), even if it is not really needed either + // (but see https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs) + if( debug ) std::cout << "__GpuRuntime: calling GpuSetDevice(0)" << std::endl; + checkGpu( gpuSetDevice( 0 ) ); // SLOW! + } + + // Tear down CUDA application (call cudaDeviceReset) + // ** NB: strictly speaking this is not needed when using the CUDA runtime API ** + // Calling cudaDeviceReset on shutdown is only needed for checking memory leaks in cuda-memcheck + // See https://docs.nvidia.com/cuda/cuda-memcheck/index.html#leak-checking + static void tearDown( const bool debug = true ) + { + if( debug ) std::cout << "__GpuRuntime: calling GpuDeviceReset()" << std::endl; + checkGpu( gpuDeviceReset() ); + } + }; +} +#endif + +//-------------------------------------------------------------------------- + +#endif // MG5AMC_GPURUNTIME_H diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h index ef40624c88..6054185300 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MadgraphTest.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Dec 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #ifndef MADGRAPHTEST_H_ #define MADGRAPHTEST_H_ 1 @@ -14,7 +14,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include #include @@ -22,7 +26,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; @@ -201,7 +205,7 @@ class MadgraphTest : public testing::TestWithParam // Since we link both the CPU-only and GPU tests into the same executable, we prevent // a multiply defined symbol by only compiling this in the non-CUDA phase: -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL /// Compare momenta and matrix elements. 
/// This uses an implementation of TestDriverBase to run a madgraph workflow, @@ -219,7 +223,14 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) const char* dumpEventsC = getenv( "CUDACPP_RUNTEST_DUMPEVENTS" ); const bool dumpEvents = ( dumpEventsC != 0 ) && ( std::string( dumpEventsC ) != "" ); const std::string refFileName = testDriver->getRefFileName(); + /* +#ifdef __HIPCC__ + const std::string dumpFileName = std::experimental::filesystem::path( refFileName ).filename(); +#else const std::string dumpFileName = std::filesystem::path( refFileName ).filename(); +#endif + */ + const std::string dumpFileName = refFileName; // bypass std::filesystem #803 std::ofstream dumpFile; if( dumpEvents ) { @@ -307,6 +318,6 @@ TEST_P( MadgraphTest, CompareMomentaAndME ) } } -#endif // __CUDACC__ +#endif // MGONGPUCPP_GPUIMPL #endif /* MADGRAPHTEST_H_ */ diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc index 74b5239ebf..81699dfea9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.cc @@ -1,12 +1,12 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "MatrixElementKernels.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" // Includes the abstraction for Nvidia/AMD compilation #include "MemoryAccessMomenta.h" #include "MemoryBuffers.h" @@ -14,7 +14,7 @@ //============================================================================ -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu { @@ -150,7 +150,7 @@ namespace mg5amcCpu //============================================================================ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu { @@ -209,13 +209,13 @@ namespace mg5amcGpu PinnedHostBufferHelicityMask hstIsGoodHel( ncomb ); DeviceBufferHelicityMask devIsGoodHel( ncomb ); // ... 0d1. Compute good helicity mask on the device - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), m_numerators.data(), m_denominators.data(), devIsGoodHel.data() ); #else - sigmaKin_getGoodHel<<>>( m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); + gpuLaunchKernel( sigmaKin_getGoodHel, m_gpublocks, m_gputhreads, m_momenta.data(), m_couplings.data(), m_matrixElements.data(), devIsGoodHel.data() ); #endif - checkCuda( cudaPeekAtLastError() ); + checkGpu( gpuPeekAtLastError() ); // ... 0d2. Copy back good helicity mask to the host copyHostFromDevice( hstIsGoodHel, devIsGoodHel ); // ... 0d3. 
Copy back good helicity list to constant memory on the device @@ -226,19 +226,19 @@ namespace mg5amcGpu void MatrixElementKernelDevice::computeMatrixElements( const unsigned int channelId ) { - computeDependentCouplings<<>>( m_gs.data(), m_couplings.data() ); + gpuLaunchKernel( computeDependentCouplings, m_gpublocks, m_gputhreads, m_gs.data(), m_couplings.data() ); #ifndef MGONGPU_NSIGHT_DEBUG constexpr unsigned int sharedMemSize = 0; #else constexpr unsigned int sharedMemSize = ntpbMAX * sizeof( float ); #endif #ifdef MGONGPU_SUPPORTS_MULTICHANNEL - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), channelId, m_numerators.data(), m_denominators.data(), m_selhel.data(), m_selcol.data() ); #else - sigmaKin<<>>( m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); + gpuLaunchKernelSharedMem( sigmaKin, m_gpublocks, m_gputhreads, sharedMemSize, m_momenta.data(), m_couplings.data(), m_rndhel.data(), m_rndcol.data(), m_matrixElements.data(), m_selhel.data(), m_selcol.data() ); #endif - checkCuda( cudaPeekAtLastError() ); - checkCuda( cudaDeviceSynchronize() ); + checkGpu( gpuPeekAtLastError() ); + checkGpu( gpuDeviceSynchronize() ); } //-------------------------------------------------------------------------- diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h index 23e84757a2..72bd8f195b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MatrixElementKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
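In the MatrixElementKernels.cc hunks above, the CUDA-only kernel launches and checkCuda calls are rewritten in terms of the gpuLaunchKernel/gpuLaunchKernelSharedMem and checkGpu wrappers, which GpuAbstraction.h and GpuRuntime.h expand to the corresponding CUDA or HIP constructs. A minimal usage sketch with a hypothetical kernel (not code from the patch):

#include "GpuRuntime.h" // brings in the GpuAbstraction.h wrappers plus the checkGpu error check

__global__ void scaleByTwo( double* data, int n ) // hypothetical kernel
{
  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if( i < n ) data[i] *= 2;
}

void runScaleByTwo( double* devData, int gpublocks, int gputhreads, int n )
{
  // The same source builds with nvcc or hipcc: the macro hides the launch syntax.
  gpuLaunchKernel( scaleByTwo, gpublocks, gputhreads, devData, n );
  checkGpu( gpuPeekAtLastError() );
  checkGpu( gpuDeviceSynchronize() );
}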
#ifndef MATRIXELEMENTKERNELS_H #define MATRIXELEMENTKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -81,7 +81,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a CPU host class MatrixElementKernelHost final : public MatrixElementKernelBase, public NumberOfEvents { @@ -130,7 +130,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating matrix element calculations on a GPU device class MatrixElementKernelDevice : public MatrixElementKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h index 573b3bbbc9..ffb76e93de 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessAmplitudes.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_AMPLITUDES 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h index 35a3af42e0..3afdf3e554 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplings.h @@ -15,7 +15,7 @@ #include "MemoryBuffers.h" // for HostBufferCouplings::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h index dc0d93afff..ffcdf4dbef 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessCouplingsFixed.h @@ -14,7 +14,7 @@ //#include "MemoryAccessHelpers.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h index 3bce635718..66f2d32a6b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessDenominators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h index 31311aa375..4c726b30f3 100644 --- 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessGs.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h index c82a6c7635..db73e4e064 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessHelpers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessHelpers_H #define MemoryAccessHelpers_H 1 @@ -105,7 +105,7 @@ class KernelAccessHelper : public MemoryAccessHelper } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid //printf( "kernelAccessRecord: ievt=%d threadId=%d\n", ievt, threadIdx.x ); return T::ieventAccessRecord( buffer, ievt ); // NB fptype and fptype_sv coincide for CUDA diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h index f32e6fea5b..3741011971 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMatrixElements.h @@ -13,7 +13,7 @@ #include "MemoryBuffers.h" // for HostBufferMatrixElements::isaligned // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h index 29266de32c..3be229d392 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessMomenta.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
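The MemoryAccessHelpers.h hunk above shows the kernel-side access pattern: in GPU builds each thread derives its event index from the grid and then accesses that event's record. A compact sketch of that pattern over a hypothetical one-value-per-event buffer (the CPU branch of the real helper is not shown in this hunk, so the #else below is only an illustrative placeholder):

// Sketch (hypothetical layout: one double per event).
#ifdef MGONGPUCPP_GPUIMPL
__device__ inline double& eventRecord( double* buffer )
{
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
  return buffer[ievt];
}
#else
inline double& eventRecord( double* buffer, const int ievt ) // placeholder: explicit event index on CPU
{
  return buffer[ievt];
}
#endif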
#ifndef MemoryAccessMomenta_H #define MemoryAccessMomenta_H 1 @@ -13,7 +13,7 @@ #include "MemoryAccessVectors.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -30,7 +30,7 @@ namespace mg5amcCpu // Number of Events Per Page in the momenta AOSOA memory buffer layout // (these are all best kept as a compile-time constants: see issue #23) -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ // ----------------------------------------------------------------------------------------------- // --- GPUs: neppM is best set to a power of 2 times the number of fptype's in a 32-byte cacheline // --- This is relevant to ensure coalesced access to momenta in global memory diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h index b152183b28..18991f4fa6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessNumerators.h @@ -10,7 +10,7 @@ #include "MemoryAccessGs.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h index e2988d39f3..40cb089135 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessRandomNumbers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryAccessRandomNumbers_H #define MemoryAccessRandomNumbers_H 1 @@ -11,7 +11,7 @@ #include "CPPProcess.h" #include "MemoryAccessHelpers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using mg5amcGpu::CPPProcess; #else using mg5amcCpu::CPPProcess; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h index e9b197368e..08faccff0f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessVectors.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. 
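MemoryAccessMomenta.h above keeps the AOSOA momenta layout momenta[npagM][npar][np4][neppM] with nevt = npagM * neppM (as documented in the Bridge.h comments), where neppM (events per page) is a compile-time constant chosen so that neighbouring GPU threads read contiguous, coalesced fptype values. A small sketch of the flat indexing that layout implies (a hypothetical helper, not the plugin's accessor classes):

// Sketch: flat index into an AOSOA buffer momenta[npagM][npar][np4][neppM],
// assuming nevt = npagM * neppM (names taken from the layout comment in Bridge.h).
inline int aosoaIndex( int ievt, int ipar, int ip4, int npar, int np4, int neppM )
{
  const int ipagM = ievt / neppM; // page containing this event
  const int ieppM = ievt % neppM; // position of this event inside the page
  return ( ( ipagM * npar + ipar ) * np4 + ip4 ) * neppM + ieppM;
}
// Consecutive events (ieppM) are adjacent in memory, which is what gives
// coalesced loads on GPUs and contiguous SIMD vectors on CPUs.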
#ifndef MemoryAccessVectors_H #define MemoryAccessVectors_H 1 @@ -10,7 +10,7 @@ #include "mgOnGpuVectors.h" -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL namespace mg5amcCpu // this is only needed for CPU SIMD vectorization { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h index 5428aaf933..33bef4559e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryAccessWavefunctions.h @@ -15,7 +15,7 @@ #define MGONGPU_TRIVIAL_WAVEFUNCTIONS 1 // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h index 3093e6ed18..7756a71621 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/MemoryBuffers.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021, based on earlier work by S. Hageboeck) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef MemoryBuffers_H #define MemoryBuffers_H 1 @@ -11,12 +11,12 @@ #include "mgOnGpuCxtypes.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "Parameters_sm.h" #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -87,7 +87,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr bool HostBufferALIGNED = false; // ismisaligned=false constexpr bool HostBufferMISALIGNED = true; // ismisaligned=true @@ -119,7 +119,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer template class PinnedHostBufferBase : public BufferBase @@ -128,18 +128,18 @@ namespace mg5amcCpu PinnedHostBufferBase( const size_t size ) : BufferBase( size, false ) { - checkCuda( cudaMallocHost( &( this->m_data ), this->bytes() ) ); + gpuMallocHost( &( this->m_data ), this->bytes() ); } virtual ~PinnedHostBufferBase() { - checkCuda( cudaFreeHost( this->m_data ) ); + gpuFreeHost( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer template class DeviceBufferBase : public BufferBase @@ -148,18 +148,18 @@ namespace mg5amcCpu DeviceBufferBase( const size_t size ) : BufferBase( size, true ) { - checkCuda( cudaMalloc( &( this->m_data ), this->bytes() ) ); + gpuMalloc( &( this->m_data ), this->bytes() ); } virtual ~DeviceBufferBase() { - checkCuda( cudaFree( this->m_data ) ); + gpuFree( this->m_data ); } }; #endif //-------------------------------------------------------------------------- -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for a given number of 
events template class HostBuffer : public HostBufferBase, virtual private NumberOfEvents @@ -175,7 +175,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA pinned host buffer for a given number of events template class PinnedHostBuffer : public PinnedHostBufferBase, virtual private NumberOfEvents @@ -191,7 +191,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating a CUDA device buffer for a given number of events template class DeviceBuffer : public DeviceBufferBase, virtual private NumberOfEvents @@ -213,7 +213,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta random numbers constexpr size_t sizePerEventRndNumMomenta = MemoryBuffers::np4 * MemoryBuffers::nparf; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta random numbers typedef HostBuffer HostBufferRndNumMomenta; #else @@ -232,7 +232,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer with ONE fptype per event constexpr size_t sizePerEventOneFp = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer with ONE fptype per event typedef HostBuffer HostBufferOneFp; #else @@ -257,7 +257,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for Gs constexpr size_t sizePerEventGs = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferGs; #else @@ -276,7 +276,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for numerators constexpr size_t sizePerEventNumerators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferNumerators; #else @@ -296,7 +296,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for denominators constexpr size_t sizePerEventDenominators = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferDenominators; #else @@ -315,7 +315,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for random numbers constexpr size_t sizePerEventCouplings = MemoryBuffers::ndcoup * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for gs typedef HostBuffer HostBufferCouplings; #else @@ -333,7 +333,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for momenta constexpr size_t sizePerEventMomenta = MemoryBuffers::np4 * MemoryBuffers::npar; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for momenta typedef HostBuffer HostBufferMomenta; //typedef HostBuffer HostBufferMomenta; // TEST MISALIGNMENT! 
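The MemoryBuffers.h hunks above keep the RAII structure of the pinned-host and device buffer classes and only swap the raw CUDA calls for the gpuMallocHost/gpuMalloc and gpuFreeHost/gpuFree wrappers. A stripped-down sketch of that pattern (a hypothetical class, assuming GpuRuntime.h and GpuAbstraction.h from this patch):

#include "GpuRuntime.h" // GpuAbstraction.h wrappers plus checkGpu, as included by MemoryBuffers.h
#include <cstddef>

#ifdef MGONGPUCPP_GPUIMPL
// Sketch only: a minimal RAII device buffer in the spirit of DeviceBufferBase.
template<typename T>
class SimpleDeviceBuffer
{
public:
  explicit SimpleDeviceBuffer( const std::size_t size )
    : m_size( size ), m_data( nullptr )
  {
    gpuMalloc( &m_data, m_size * sizeof( T ) ); // expands to the CUDA or HIP allocation, error-checked via checkGpu
  }
  ~SimpleDeviceBuffer() { gpuFree( m_data ); } // released automatically when the buffer goes out of scope
  SimpleDeviceBuffer( const SimpleDeviceBuffer& ) = delete;
  SimpleDeviceBuffer& operator=( const SimpleDeviceBuffer& ) = delete;
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  std::size_t m_size;
  T* m_data;
};
#endif // MGONGPUCPP_GPUIMPL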
@@ -352,7 +352,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for sampling weights constexpr size_t sizePerEventWeights = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for sampling weights typedef HostBuffer HostBufferWeights; #else @@ -370,7 +370,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for matrix elements constexpr size_t sizePerEventMatrixElements = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for matrix elements typedef HostBuffer HostBufferMatrixElements; #else @@ -385,7 +385,7 @@ namespace mg5amcCpu // A base class encapsulating a memory buffer for the helicity mask typedef BufferBase BufferHelicityMask; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for the helicity mask typedef HostBufferBase HostBufferHelicityMask; #else @@ -403,7 +403,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for wavefunctions constexpr size_t sizePerEventWavefunctions = MemoryBuffers::nw6 * MemoryBuffers::nx2; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for wavefunctions typedef HostBuffer HostBufferWavefunctions; #else @@ -421,7 +421,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity random numbers constexpr size_t sizePerEventRndNumHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity random numbers typedef HostBuffer HostBufferRndNumHelicity; #else @@ -439,7 +439,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color random numbers constexpr size_t sizePerEventRndNumColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color random numbers typedef HostBuffer HostBufferRndNumColor; #else @@ -457,7 +457,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for helicity selection constexpr size_t sizePerEventSelectedHelicity = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for helicity selection typedef HostBuffer HostBufferSelectedHelicity; #else @@ -475,7 +475,7 @@ namespace mg5amcCpu // The size (number of elements) per event in a memory buffer for color selection constexpr size_t sizePerEventSelectedColor = 1; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // A class encapsulating a C++ host buffer for color selection typedef HostBuffer HostBufferSelectedColor; #else @@ -487,7 +487,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyDeviceFromHost( Tdst& dst, const Tsrc& src ) // keep the same order of arguments as in memcpy { @@ -504,13 +504,13 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyHostToDevice ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyHostToDevice ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL template void copyHostFromDevice( Tdst& dst, const Tsrc& src ) // keep the same order 
of arguments as in memcpy { @@ -527,7 +527,7 @@ namespace mg5amcCpu throw std::runtime_error( sstr.str() ); } // NB (PR #45): cudaMemcpy involves an intermediate memcpy to pinned memory if host array is a not a pinned host array - checkCuda( cudaMemcpy( dst.data(), src.data(), src.bytes(), cudaMemcpyDeviceToHost ) ); + gpuMemcpy( dst.data(), src.data(), src.bytes(), gpuMemcpyDeviceToHost ); } #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc index 7f14b5e299..c465192676 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ WEIGHTED<=2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < 
nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -302,7 +303,7 @@ namespace mg5amcCpu { 16, -2 }, { -2, 16 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -359,7 +360,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -418,7 +419,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -465,8 +466,8 @@ namespace mg5amcCpu { 1, 1, -1, -1 }, { 1, 1, 1, 1 }, { 1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -506,9 +507,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -544,7 +545,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -573,6 +574,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! 
+ // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! // GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -586,6 +591,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -609,12 +616,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -635,7 +642,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -761,9 +768,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -787,7 +794,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -807,7 +814,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -821,9 +828,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -851,7 +861,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1061,7 +1071,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum 
over final spins, average over initial spins', eg see // https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h index 448175be9d..f8a20b77fc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at 
end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_gg_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
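check_sa.cc now includes GpuRuntime.h in place of the deleted CudaRuntime.h symlink; in the hunk that follows, the CudaRuntime instance becomes a GpuRuntime, described as invoking cudaSetDevice(0) in its constructor and booking a cudaDeviceReset() call in its destructor for CUDA. Below is a minimal RAII sketch of such a helper under stated assumptions: the gpuSetDevice/gpuDeviceReset names and their mapping are illustrative and not taken from this patch.

// ---- illustrative sketch only, not the actual GpuRuntime.h added by this patch ----
#include <cstdio>
#ifdef __CUDACC__
#define gpuSetDevice cudaSetDevice // assumed mapping (CUDA)
#define gpuDeviceReset cudaDeviceReset
#elif defined __HIPCC__
#define gpuSetDevice hipSetDevice // assumed mapping (HIP)
#define gpuDeviceReset hipDeviceReset
#endif
struct GpuRuntimeSketch
{
  GpuRuntimeSketch( const bool debug = false )
    : m_debug( debug )
  {
    if( m_debug ) printf( "GpuRuntimeSketch: setting device 0\n" );
    gpuSetDevice( 0 ); // select the first GPU for this process
  }
  ~GpuRuntimeSketch()
  {
    if( m_debug ) printf( "GpuRuntimeSketch: resetting the device\n" );
    gpuDeviceReset(); // release the device context when main() exits
  }
  const bool m_debug;
};

Declaring one such object at the top of main(), as the hunk below does with "GpuRuntime GpuRuntime( debug )", scopes GPU initialisation and cleanup to the lifetime of the application on both CUDA and HIP.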
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
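The kernels selected above consume device-side buffers (devMomenta, devGs, devRndHel, ...) that are paired with the pinned host buffers declared earlier and moved across with the copyDeviceFromHost/copyHostFromDevice helpers from MemoryBuffers.h. A small usage sketch, assuming the device-side typedefs mirror the host-side ones (e.g. a DeviceBufferGs counterpart to PinnedHostBufferGs) and with an nevt value and fill value chosen only for illustration:

#ifdef MGONGPUCPP_GPUIMPL
  const int nevt = 16;                // illustrative event count
  PinnedHostBufferGs hstGs( nevt );   // pinned host buffer (gpuMallocHost/gpuFreeHost)
  DeviceBufferGs devGs( nevt );       // device buffer (gpuMalloc/gpuFree), assumed typedef name
  for( int ievt = 0; ievt < nevt; ievt++ )
    hstGs.data()[ievt] = 1.2;         // fill Gs on the host (value is illustrative)
  copyDeviceFromHost( devGs, hstGs ); // H2D via gpuMemcpy( ..., gpuMemcpyHostToDevice )
  // ... run computeDependentCouplings / sigmaKin on the device buffers ...
  copyHostFromDevice( hstGs, devGs ); // D2H via gpuMemcpy( ..., gpuMemcpyDeviceToHost )
#endif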
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc index 20496eaa70..85bdc6bf24 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ WEIGHTED<=2 // Process: s s~ > t t~ WEIGHTED<=2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -279,7 +280,7 @@ namespace mg5amcCpu { 9, 3 }, { 3, 9 } }; // 2-D array[2][2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -336,7 +337,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -395,7 +396,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef 
__CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -442,8 +443,8 @@ namespace mg5amcCpu { -1, 1, -1, -1 }, { -1, 1, 1, 1 }, { -1, 1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -483,9 +484,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -521,7 +522,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -550,6 +551,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -563,6 +568,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -586,12 +593,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -612,7 +619,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -738,9 +745,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -764,7 +771,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -784,7 +791,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -798,9 +805,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -828,7 +838,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1038,7 +1048,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h index e166fa1652..6498b91441 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P0_uux_ttx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc index afeebde3c6..2fa9b4f651 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -505,7 +506,7 @@ namespace mg5amcCpu { 1, -8, 10, 1, 64, -8 }, { 10, 1, 1, -8, -8, 64 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -562,7 +563,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -621,7 +622,7 @@ namespace mg5amcCpu 
MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -684,8 +685,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, 1 }, { 1, 1, 1, -1, -1 }, { 1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -726,9 +727,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -765,7 +766,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -794,6 +795,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
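// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.]
// The gpuMemcpyToSymbol calls introduced in this hunk presumably come from the
// new GpuAbstraction.h header added later in this patch, which maps a common
// gpu* vocabulary onto the CUDA or HIP runtime API. A minimal sketch of the
// idea; the checkGpu helper name is an assumption, in the spirit of the old
// checkCuda:
#include <stdexcept>
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#endif
#if defined __CUDACC__ || defined __HIPCC__
inline void checkGpu( gpuError_t code ) // throw on any GPU runtime error
{
  if( code != gpuSuccess ) throw std::runtime_error( gpuGetErrorString( code ) );
}
#if defined __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( cudaMemcpyToSymbol( symbol, src, bytes ) )
#else
#define gpuMemcpyToSymbol( symbol, src, bytes ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif
#endif
// ---------------------------------------------------------------------------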
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -807,6 +812,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -830,12 +837,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -856,7 +863,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -982,9 +989,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1008,7 +1015,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1028,7 +1035,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1042,9 +1049,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1072,7 +1082,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1282,7 +1292,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h index 37d6ebe981..11f562273e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gg_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
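// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.]
// GpuRuntime.h, included above, replaces the old CudaRuntime.h. Based on the
// comment added further below ("For CUDA this invokes cudaSetDevice(0) in the
// constructor and books a cudaDeviceReset() call in the destructor"), a minimal
// RAII sketch of such a class could look as follows; the "Sketch" suffix makes
// clear this is not the repository's real GpuRuntime:
#if defined __CUDACC__
#include <cuda_runtime.h>
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#endif
struct GpuRuntimeSketch
{
  explicit GpuRuntimeSketch( bool debug = false )
    : m_debug( debug )
  {
#if defined __CUDACC__
    cudaSetDevice( 0 ); // select the first visible NVIDIA GPU
#elif defined __HIPCC__
    hipSetDevice( 0 ); // select the first visible AMD GPU
#endif
  }
  ~GpuRuntimeSketch()
  {
#if defined __CUDACC__
    cudaDeviceReset(); // flush device state (e.g. for profilers) when main ends
#elif defined __HIPCC__
    hipDeviceReset();
#endif
  }
  bool m_debug;
};
// Usage, as in the hunk below: instantiate one such object at the top of main()
// so that the device is initialised once and released however main() returns.
// ---------------------------------------------------------------------------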
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
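// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.]
// The HostBuffer*/PinnedHostBuffer* pairs allocated earlier in this file differ
// mainly in how the host memory is obtained: in GPU builds the host-side staging
// buffers are pinned (page-locked), which is what allows fast DMA transfers to
// and from the device buffers. A minimal CUDA-only sketch of the idea; the real
// buffer classes in the repository have a richer interface and a HIP branch:
#include <cstddef>
#include <stdexcept>
#ifdef __CUDACC__
#include <cuda_runtime.h>
template<typename T>
class PinnedHostArraySketch // hypothetical name, illustration only (copying is not handled)
{
public:
  explicit PinnedHostArraySketch( std::size_t n )
    : m_size( n ), m_data( nullptr )
  {
    // cudaMallocHost returns page-locked memory that the driver can DMA directly
    if( cudaMallocHost( reinterpret_cast<void**>( &m_data ), n * sizeof( T ) ) != cudaSuccess )
      throw std::runtime_error( "cudaMallocHost failed" );
  }
  ~PinnedHostArraySketch() { cudaFreeHost( m_data ); }
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  std::size_t m_size;
  T* m_data;
};
#endif
// ---------------------------------------------------------------------------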
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc index b7e3475679..f505a0d8c0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ d WEIGHTED<=3 @1 // Process: g s > t t~ s WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; 
#endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, -1, 1, 1, 1 }, { 1, -1, 1, -1, -1 }, { 1, -1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -630,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
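// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.]
// The sigmaKin hunks in this file keep a single source for GPU and CPU builds:
// on the GPU each kernel thread handles exactly one event (ievt derived from the
// grid coordinates), while on the CPU the same function loops explicitly over
// nevt events and __global__ expands to nothing. A distilled sketch of that
// dispatch pattern, with a hypothetical function name and a trivial body:
__global__ void
exampleSigmaKinSketch( double* allMEs // output: one running |M|^2 sum per event
#ifndef MGONGPUCPP_GPUIMPL
                       , const int nevt // CPU build only: number of events to process
#endif
                       )
{
#ifdef MGONGPUCPP_GPUIMPL
  // GPU build: one thread, one event
  const int ievt = blockDim.x * blockIdx.x + threadIdx.x;
  allMEs[ievt] = 0; // placeholder for the per-event computation
#else
  // CPU build: explicit loop over all events
  for( int ievt = 0; ievt < nevt; ievt++ )
    allMEs[ievt] = 0; // placeholder for the per-event computation
#endif
}
// ---------------------------------------------------------------------------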
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -643,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -666,12 +673,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +699,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +825,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +851,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +871,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +885,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +918,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1128,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h index bf037c6c28..ce22572055 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gu_ttxu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
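// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.]
// The "#error ... MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__" guard
// added above encodes the assumption that HIP builds never use curand (cuRAND is
// an NVIDIA-only library), so AMD GPUs fall back to the CommonRandom generator.
// A sketch of how a configuration header or a makefile-driven define could
// enforce that invariant before this file is compiled (the actual mechanism in
// the repository may differ):
#if defined __HIPCC__ && !defined MGONGPU_HAS_NO_CURAND
#define MGONGPU_HAS_NO_CURAND 1 // cuRAND is unavailable on AMD GPUs: use CommonRandom
#endif
// ---------------------------------------------------------------------------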
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
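// ---------------------------------------------------------------------------
// [Editor's note: illustrative sketch, not part of the patch.]
// Throughout these hunks check_sa.cc accumulates per-phase timings via a string-
// keyed timer map ("timermap.start( key )" appears to close the currently open
// section, credit its elapsed time, and open a new one, returning the elapsed
// time so it can be added to rambtime/wavetime). A minimal stand-in with a
// hypothetical class name, inferred from that usage only:
#include <chrono>
#include <map>
#include <string>
class SectionTimerSketch
{
public:
  // Close the current section (if any), open a new one under 'key',
  // and return the seconds spent in the section that was just closed.
  double start( const std::string& key )
  {
    const double elapsed = stop();
    m_currentKey = key;
    m_t0 = std::chrono::high_resolution_clock::now();
    m_open = true;
    return elapsed;
  }
  // Close the current section and return the seconds spent in it.
  double stop()
  {
    if( !m_open ) return 0;
    const double elapsed = std::chrono::duration<double>( std::chrono::high_resolution_clock::now() - m_t0 ).count();
    m_totals[m_currentKey] += elapsed;
    m_open = false;
    return elapsed;
  }
private:
  std::map<std::string, double> m_totals;
  std::string m_currentKey;
  std::chrono::high_resolution_clock::time_point m_t0;
  bool m_open = false;
};
// ---------------------------------------------------------------------------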
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc index 0f999663da..9cba1a6d8c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ d~ WEIGHTED<=3 @1 // Process: g s~ > t t~ s~ WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; 
#endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { 1, 1, 1, 1, -1 }, { 1, 1, 1, -1, 1 }, { 1, 1, 1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -630,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
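The @@ -520 and @@ -562 hunks above (and the analogous sigmaKin_getGoodHel hunk further down) replace the CUDA-only `checkCuda( cudaMemcpyToSymbol( ... ) )` calls with backend-neutral `gpuMemcpyToSymbol( ... )` calls, whose definitions live in the new GpuAbstraction.h header that this diff only adds as a symlink. A plausible sketch of that mapping, assuming a thin macro layer (the real header presumably also keeps the error checking that checkCuda used to provide):

// Assumed mapping, in the spirit of GpuAbstraction.h (whose contents are not shown in this diff);
// error checking, previously provided by checkCuda, is omitted here for brevity.
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuMemcpyToSymbol cudaMemcpyToSymbol
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuMemcpyToSymbol hipMemcpyToSymbol
#endif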
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -643,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -666,12 +673,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +699,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +825,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +851,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +871,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +885,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +918,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1128,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h index 0f49f5247b..46c4347506 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff 
--git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_gux_ttxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
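check_sa.cc now includes the new GpuRuntime.h (added in the @@ -12,6 +12,7 hunk above) and, in the "00 GpuInit" hunk further down in this file, instantiates a GpuRuntime where it previously instantiated a CudaRuntime. The header itself is only added as a symlink in this diff, so its contents are not visible here; a hedged sketch of such an RAII wrapper, based only on the behaviour described in the updated comments (select the device on construction, reset it on destruction), with the HIP branch and the class layout as guesses:

// Illustrative stand-in for GpuRuntime in GpuRuntime.h (assumed, not taken from this diff).
#include <iostream>
#if defined __CUDACC__
#include <cuda_runtime.h>
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#endif
class GpuRuntimeSketch
{
public:
  GpuRuntimeSketch( bool debug = false ) : m_debug( debug )
  {
#if defined __CUDACC__
    if( m_debug ) std::cout << "GpuRuntime: calling cudaSetDevice(0)" << std::endl;
    cudaSetDevice( 0 ); // select the first NVidia GPU
#elif defined __HIPCC__
    if( m_debug ) std::cout << "GpuRuntime: calling hipSetDevice(0)" << std::endl;
    hipSetDevice( 0 ); // select the first AMD GPU
#endif
  }
  ~GpuRuntimeSketch()
  {
#if defined __CUDACC__
    if( m_debug ) std::cout << "GpuRuntime: calling cudaDeviceReset()" << std::endl;
    cudaDeviceReset(); // release device resources at the end of main
#elif defined __HIPCC__
    if( m_debug ) std::cout << "GpuRuntime: calling hipDeviceReset()" << std::endl;
    hipDeviceReset(); // release device resources at the end of main
#endif
  }
private:
  bool m_debug;
};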
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
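Since curand is a CUDA-only library, the default random-number mode chosen at the top of this file (see the @@ -134,9 +135,11 hunk above) now depends on the build flavour, and the error messages above are reworded to cover non-NVidia GPUs as well as CPUs. A compact, self-contained restatement of that compile-time choice, mirroring the RandomNumberMode enum of check_sa.cc (the helper name is illustrative only, not from this diff):

// Illustrative helper summarising the compile-time default after this change.
enum class RandomNumberMode{ CommonRandom = 0, CurandHost = 1, CurandDevice = 2 };
constexpr RandomNumberMode defaultRndgen()
{
#if defined MGONGPU_HAS_NO_CURAND
  return RandomNumberMode::CommonRandom; // only supported mode without curand (PRs #784 and #785);
                                         // HIP builds are expected to take this branch, hence the #error guard above
#elif defined __CUDACC__
  return RandomNumberMode::CurandDevice; // default on NVidia GPUs when the build has curand
#else
  return RandomNumberMode::CurandHost; // default on CPUs when the build has curand
#endif
}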
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc index 87830582d7..222171c5cc 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g WEIGHTED<=3 @1 // Process: s s~ > t t~ g WEIGHTED<=3 @1 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -341,7 +342,7 @@ namespace mg5amcCpu { 4, 0, 12, 4 }, { 0, 4, 4, 12 } }; // 2-D array[4][4] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -398,7 +399,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -457,7 +458,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; 
#endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -520,8 +521,8 @@ namespace mg5amcCpu { -1, 1, 1, 1, 1 }, { -1, 1, 1, -1, -1 }, { -1, 1, 1, -1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -562,9 +563,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -601,7 +602,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -630,6 +631,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
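The comments above explain why the GCC-toolchain probe was disabled: it shelled out through the CXX environment variable at run time, whereas a build-time value is what is actually wanted. If that information is ever needed again, it can be taken from compiler-predefined macros instead; a small sketch of that alternative (not part of this diff, shown only to illustrate the build-time/run-time distinction):

// Illustrative alternative: compiler version strings baked in at build time via predefined macros.
#include <string>
inline std::string builtWithCompiler()
{
#if defined __clang__
  return std::string( "clang " ) + __clang_version__; // expanded when this file is compiled
#elif defined __GNUC__
  return std::string( "gcc " ) + __VERSION__; // expanded when this file is compiled
#else
  return "unknown compiler";
#endif
}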
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -643,6 +648,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -666,12 +673,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -692,7 +699,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -818,9 +825,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -844,7 +851,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -864,7 +871,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -878,9 +885,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -908,7 +918,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1118,7 +1128,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h index f8bdb38aee..fc7c0d8196 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P1_uux_ttxg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
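As in the other check_sa.cc hunks, GPU builds select the PinnedHostBuffer* classes (page-locked host memory) plus device buffers, which is what keeps the later host-to-device and device-to-host copies fast. The buffer classes themselves are defined in MemoryBuffers.h and are not part of this diff; a minimal sketch of a pinned host allocation behind the same CUDA/HIP abstraction, with illustrative names:

// Illustrative stand-in for the PinnedHostBuffer classes in MemoryBuffers.h (assumed, not from this diff);
// gpuMallocHost/gpuFreeHost are hypothetical names for the backend-neutral wrappers.
#include <cstddef>
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuMallocHost( pptr, bytes ) cudaMallocHost( pptr, bytes )
#define gpuFreeHost( ptr ) cudaFreeHost( ptr )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuMallocHost( pptr, bytes ) hipHostMalloc( pptr, bytes )
#define gpuFreeHost( ptr ) hipHostFree( ptr )
#endif
#ifdef MGONGPUCPP_GPUIMPL
template<typename T>
class PinnedHostBufferSketch
{
public:
  PinnedHostBufferSketch( const size_t size ) : m_size( size ), m_data( nullptr )
  {
    gpuMallocHost( (void**)&m_data, m_size * sizeof( T ) ); // page-locked host memory: faster H2D/D2H copies
  }
  ~PinnedHostBufferSketch() { gpuFreeHost( m_data ); }
  T* data() { return m_data; }
  const T* data() const { return m_data; }
  size_t size() const { return m_size; }
private:
  const size_t m_size;
  T* m_data;
};
#endif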
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc index 9051b3108d..c374ce3189 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -46,7 +45,7 @@ // Class member functions for calculating the matrix elements for // Process: g g > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -80,7 +79,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -90,7 +89,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -118,13 +117,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -151,7 +150,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -187,7 +186,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -200,8 +199,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -2417,7 +2418,7 @@ namespace mg5amcCpu { 62, 71, -10, 80, -1, 8, -28, 62, 62, -10, -10, -1, -1, 8, -10, -1, -64, 8, 8, -64, 80, 8, 512, -64 }, { -28, 62, 62, -10, -10, -1, 62, 71, -10, 80, -1, 8, -10, -1, -1, 8, 8, -64, 80, 8, 8, -64, -64, 512 } }; // 2-D array[24][24] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -2474,7 +2475,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ 
START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -2533,7 +2534,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -2628,8 +2629,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -2671,9 +2672,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -2711,7 +2712,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -2740,6 +2741,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
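In the hunks above, checkCuda( cudaMemcpyToSymbol( ... ) ) becomes gpuMemcpyToSymbol( ... ), one of the wrappers expected from the new GpuAbstraction.h header that this patch adds as a symlink next to each generated CPPProcess.cc and includes inside sigmaKin further down. The header body is not shown in these hunks; a plausible sketch of such a wrapper, with the error-check helper name (checkGpu) assumed purely for illustration, is:

    // Sketch only: GpuAbstraction.h itself is not part of the hunks shown in this diff.
    #ifdef __CUDACC__
    #define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( cudaMemcpyToSymbol( symbol, src, count ) )
    #elif defined __HIPCC__
    #define gpuMemcpyToSymbol( symbol, src, count ) checkGpu( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, count ) )
    #endif

With the constant-memory copies routed through this single name, the generated code no longer needs CudaRuntime.h (its include is dropped at the top of the file) and the same CPPProcess.cc compiles unchanged under nvcc and hipcc.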
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -2753,6 +2758,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -2776,12 +2783,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -2802,7 +2809,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -2928,9 +2935,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -2954,7 +2961,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -2974,7 +2981,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 512 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -2988,9 +2995,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -3018,7 +3028,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -3228,7 +3238,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h index 9f43559181..511b053c2a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -107,7 +107,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -120,7 +120,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -150,7 +150,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
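check_sa.cc gains an include of the new GpuRuntime.h and, as the next hunk shows, replaces the CudaRuntime object at the start of main() with a GpuRuntime whose constructor selects the device and whose destructor books the device reset. The class itself is not in the hunks shown here; a minimal RAII sketch of the behaviour described in the patched comment (cudaSetDevice(0) on construction, cudaDeviceReset() on destruction for CUDA builds), with the member layout assumed for illustration, is:

    // Sketch only: the real class lives in GpuRuntime.h, added as a symlink elsewhere in this patch.
    struct GpuRuntime
    {
      GpuRuntime( bool debug = false )
        : m_debug( debug )
      {
    #ifdef __CUDACC__
        cudaSetDevice( 0 ); // select the first device before any allocation or kernel launch
    #endif
      }
      ~GpuRuntime()
      {
    #ifdef __CUDACC__
        cudaDeviceReset(); // also flushes device-side printf and profiling buffers at the end of main()
    #endif
      }
      bool m_debug;
    };

Tying set-up and tear-down to an object's lifetime keeps the "00 GpuInit" timer key and the initialisation itself in one place, whichever GPU runtime is compiled in.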
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
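The buffer hunks above pick the HostBuffer* types for C++-only builds and the pinned host buffers (plus the corresponding device buffers) when MGONGPUCPP_GPUIMPL is set: page-locked host memory lets the momenta, weight and matrix-element copies between host and device run as fast DMA transfers. The buffer classes themselves live in MemoryBuffers.h, which is not touched in the hunks shown here; a minimal illustration of the allocation difference, with gpuMallocHost assumed to stand for cudaMallocHost (CUDA) or hipHostMalloc (HIP), is:

    // Sketch only: the real types are the (Pinned)HostBuffer templates in MemoryBuffers.h.
    #include <cstddef>
    template<typename T>
    T* allocateHostBuffer( std::size_t nelem )
    {
    #ifdef MGONGPUCPP_GPUIMPL
      void* ptr = nullptr;
      gpuMallocHost( &ptr, nelem * sizeof( T ) ); // page-locked memory: faster, DMA-capable H2D/D2H copies
      return static_cast<T*>( ptr );
    #else
      return new T[nelem]; // pageable memory is enough when no device copies take place
    #endif
    }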
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc index 866433ae8b..6e93814e8e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g g > t t~ d d~ WEIGHTED<=4 @2 // Process: g g > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 6, -6, -2, 16, 0, 0, -2, 16, 6, 48, 16 }, { 6, -2, -2, -6, 0, 16, -2, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 
+929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1135,6 +1136,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
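The comment block being added here disables the readelf-based GCC toolchain detection that continues just below; the new remarks explain why it was wrong on both counts, since CXX need not be the compiler that produced the binary and the value is needed at build time, not at run time. Purely as an illustration of the alternative hinted at by that reasoning, and not something this patch introduces, the toolchain string could instead be baked in as a preprocessor definition supplied by the build system:

    // Hypothetical sketch, not part of this patch: the Makefile could pass, for example,
    //   CXXFLAGS += -DMGONGPU_GCCTOOLCHAIN="\"$(shell gcc -dumpfullversion)\""
    #ifdef MGONGPU_GCCTOOLCHAIN
      out << " (gcc " << MGONGPU_GCCTOOLCHAIN << ")"; // value recorded when the object file was compiled
    #endif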
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -1148,6 +1153,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -1171,12 +1178,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1204,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1330,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1356,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1376,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 256 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1390,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1423,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1633,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h index f26b60c5bb..c411623fc8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff 
--git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gg_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc index 1be98364ee..ed55777a81 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d > t t~ g d WEIGHTED<=4 @2 // Process: g s > t t~ g s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 0, -2, -6, -2, -2, 6, 16, 0, 6, 16, 48, 16 }, { -2, 0, -2, -6, 6, -2, 0, 16, 16, 6, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 
+929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, -1, 1, -1, -1, 1 }, { 1, -1, 1, -1, 1, -1 }, { 1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1135,6 +1136,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -1148,6 +1153,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -1171,12 +1178,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1204,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1330,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1356,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1376,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1390,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1423,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1633,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h index 853175b477..9c820a5ddb 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gu_ttxgu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc index dfb05016f5..1e9d03033f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: g d~ > t t~ g d~ WEIGHTED<=4 @2 // Process: g s~ > t t~ g s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { -2, 0, 0, 16, -2, -6, 6, -2, 16, 6, 48, 16 }, { 0, -2, 16, 0, -6, -2, -2, 6, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 
+929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, -1, -1 }, { 1, 1, 1, -1, 1, 1 }, { 1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1135,6 +1136,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -1148,6 +1153,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -1171,12 +1178,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1204,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1330,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1356,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1376,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 96 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1390,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1423,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1633,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h index e60cb5b6d7..a5a285b22d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end 
of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_gux_ttxgux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc index ecef3e57ca..75930b65cd 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c s > t t~ c s WEIGHTED<=4 @2 // Process: d s > t t~ d s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -710,6 +711,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
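//--------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The block disabled just above
// tried to detect the GCC toolchain behind clang by running a shell command at
// run time through the CXX environment variable; as the new comments point
// out, what is wanted here is a value fixed at build time. Predefined compiler
// macros are one way to capture that; a minimal sketch, not the plugin's
// actual reporting code:
#include <iostream>
#include <sstream>
#include <string>

static std::string compilerVersionAtBuildTime()
{
  std::ostringstream out;
#if defined __clang__
  out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__;
#elif defined __GNUC__
  out << "gcc " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
#else
  out << "unknown compiler";
#endif
  return out.str(); // expanded from macros fixed when this file was compiled
}

int main()
{
  std::cout << compilerVersionAtBuildTime() << std::endl;
  return 0;
}
//--------------------------------------------------------------------------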
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -723,6 +728,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -746,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h index 5329710b87..8c84687f8a 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uc_ttxuc/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
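//--------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] In check_sa.cc and the
// CPPProcess files above, the choice between the mg5amcGpu and mg5amcCpu
// namespaces is now keyed on MGONGPUCPP_GPUIMPL instead of the
// compiler-specific __CUDACC__, so that HIP builds also land in the GPU
// namespace. The pattern in isolation, with hypothetical names Foo and
// doWork():
#ifdef MGONGPUCPP_GPUIMPL
namespace mg5amcGpu
#else
namespace mg5amcCpu
#endif
{
  struct Foo
  {
    int doWork() const { return 42; }
  };
}
// A translation unit compiled for a GPU backend sees mg5amcGpu::Foo, the same
// source compiled for the CPU sees mg5amcCpu::Foo; the code in between is
// written only once.
//--------------------------------------------------------------------------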
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
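//--------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] In the "00 GpuInit" hunk a
// little earlier in this file's diff, the CUDA-only CudaRuntime helper is
// replaced by a GpuRuntime instantiated at the start of main(); per the new
// comment, for CUDA its constructor calls cudaSetDevice(0) and its destructor
// books a cudaDeviceReset(). A minimal RAII sketch of that idea, assuming CUDA
// only (the repository's GpuRuntime.h also covers HIP and may differ in
// detail):
#include <cuda_runtime.h>
#include <iostream>

class GpuRuntimeSketch
{
public:
  explicit GpuRuntimeSketch( bool debug = false )
    : m_debug( debug )
  {
    if( m_debug ) std::cout << "GpuRuntimeSketch: selecting device 0" << std::endl;
    cudaSetDevice( 0 ); // bind this host thread to GPU device 0
  }
  ~GpuRuntimeSketch()
  {
    if( m_debug ) std::cout << "GpuRuntimeSketch: resetting device" << std::endl;
    cudaDeviceReset(); // release all device resources before the program exits
  }
private:
  bool m_debug;
};

int main()
{
  GpuRuntimeSketch runtime( true ); // device set up here, reset when main returns
  return 0;
}
//--------------------------------------------------------------------------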
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc index e4f9dee3a2..b12263362e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s c~ > t t~ s c~ WEIGHTED<=4 @2 // Process: s d~ > t t~ s d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -716,6 +717,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
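//--------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The CPPProcess.cc hunks above
// replace direct checkCuda( cudaMemcpyToSymbol( ... ) ) calls with a generic
// gpuMemcpyToSymbol( ... ), which the new GpuAbstraction.h header is expected
// to map onto the CUDA or HIP runtime. One possible shape for that mapping;
// checkCuda/checkHip stand in for whatever error checking the real header
// uses, and the actual GpuAbstraction.h may differ:
#if defined __CUDACC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) \
  checkCuda( cudaMemcpyToSymbol( symbol, src, bytes ) )
#elif defined __HIPCC__
#define gpuMemcpyToSymbol( symbol, src, bytes ) \
  checkHip( hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes ) )
#endif
// With such a mapping, the single call in CPPProcess.cc,
//   gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) );
// expands to the appropriate runtime call for either GPU backend.
//--------------------------------------------------------------------------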
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -729,6 +734,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -752,12 +759,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +785,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +911,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +937,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +957,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +971,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +1004,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1214,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h index 391789dc81..da747c3465 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end 
of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_ucx_ttxucx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
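//--------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The buffer-allocation hunks in
// the check_sa.cc diffs of this patch select plain HostBuffer* types for
// CPU-only builds and PinnedHostBuffer* types for GPU builds; pinned
// (page-locked) host memory is what enables fast, asynchronous host/device
// copies. A simplified sketch of that distinction with a hypothetical
// PinnedHostArray type, assuming CUDA:
#include <cstddef>
#include <vector>

#ifdef MGONGPUCPP_GPUIMPL
#include <cuda_runtime.h>
// GPU build: page-locked host memory, suitable for cudaMemcpyAsync
template<typename T>
class PinnedHostArray
{
public:
  explicit PinnedHostArray( std::size_t n ) { cudaMallocHost( reinterpret_cast<void**>( &m_data ), n * sizeof( T ) ); }
  ~PinnedHostArray() { cudaFreeHost( m_data ); }
  T* data() { return m_data; }
private:
  T* m_data = nullptr;
};
using HostMomenta = PinnedHostArray<double>;
#else
// CPU build: ordinary pageable memory is enough
using HostMomenta = std::vector<double>;
#endif
// In either build, "HostMomenta hstMomenta( nevt );" gives a host buffer whose
// data() pointer can be handed to the sampling and matrix-element kernels.
//--------------------------------------------------------------------------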
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
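//--------------------------------------------------------------------------
// [Illustrative sketch, not part of the patch] The reporting hunks in this
// patch add a CXSIMPLE branch so that builds using neither cuComplex nor
// Thrust (in particular HIP builds) are labelled correctly. Underneath, the
// plugin selects one complex type per build from these same macros; a
// simplified sketch of such a compile-time selection (the plugin's actual
// complex-type header is more elaborate and also covers cuComplex):
#include <complex>
#if defined MGONGPU_CUCXTYPE_THRUST
#include <thrust/complex.h>
using cxtype_sketch = thrust::complex<double>; // CUDA build with Thrust complex
#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX
using cxtype_sketch = std::complex<double>; // C++ build with std::complex
#else
// CXSIMPLE: a hand-written complex type usable on both host and device
// (placeholder for the cxsmpl class the *_CXSMPL macros refer to)
struct cxsimple_sketch
{
  double re;
  double im;
};
using cxtype_sketch = cxsimple_sketch;
#endif
//--------------------------------------------------------------------------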
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc index 302d63e31d..d55c7270e6 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d > t t~ d d WEIGHTED<=4 @2 // Process: s s > t t~ s s WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, -1, 1, -1, -1, 1 }, { -1, -1, 1, -1, 1, -1 }, { -1, -1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -820,6 +821,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
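The block disabled just above used to probe, at run time, which gcc toolchain sits behind a clang build by running readelf on $(${CXX} -print-libgcc-file-name); the new comments explain why this is unreliable (CXX need not be the compiler that actually built the binary, and a run-time probe cannot report build-time information). For illustration only, the sketch below shows the kind of build-time alternative that uses nothing but compiler-predefined macros; it is not code from this PR, and the GNU-compatibility version advertised by clang is only a rough stand-in for the real toolchain version that the disabled code was trying to extract.

#include <sstream>
#include <string>

// Build a compiler tag from predefined macros, which are fixed at compile time
// (unlike the ${CXX} probe disabled above, which is evaluated when the binary runs).
inline std::string compilerTagSketch()
{
  std::stringstream out;
#if defined __clang__
  out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__;
#if defined __GNUC__
  // clang also reports the GNU version it emulates; this is not the same thing
  // as the gcc toolchain providing libstdc++, so it is only an approximation
  out << " (GNU compat " << __GNUC__ << "." << __GNUC_MINOR__ << ")";
#endif
#elif defined __GNUC__
  out << "gcc " << __GNUC__ << "." << __GNUC_MINOR__ << "." << __GNUC_PATCHLEVEL__;
#else
  out << "unknown compiler";
#endif
  return out.str();
}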
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -833,6 +838,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -856,12 +863,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +889,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1015,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1041,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1061,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1075,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1108,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1318,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h index 2d95f4b170..d8232ea652 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff --git 
a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uu_ttxuu/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
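A few hunks above, the matrix-element kernel is still selected at compile time: GPU builds reset pmek to MatrixElementKernelDevice (or BridgeKernelDevice), CPU builds to the Host variants, with the guard now being MGONGPUCPP_GPUIMPL rather than __CUDACC__. The fragment below is only a condensed illustration of that pattern, using hypothetical names (IMEKernelSketch and friends); it is not the plugin's MatrixElementKernels or BridgeKernels API.

#include <memory>

// One abstract interface, two implementations, and a compile-time choice:
// the same shape as the pmek selection in check_sa.cc above.
struct IMEKernelSketch
{
  virtual ~IMEKernelSketch() {}
  virtual void computeMatrixElements() = 0;
};

struct MEKernelHostSketch : public IMEKernelSketch
{
  void computeMatrixElements() override { /* C++/SIMD path on the host */ }
};

struct MEKernelDeviceSketch : public IMEKernelSketch
{
  void computeMatrixElements() override { /* launch GPU kernels */ }
};

inline std::unique_ptr<IMEKernelSketch> makeMEKernelSketch()
{
#ifdef MGONGPUCPP_GPUIMPL
  return std::unique_ptr<IMEKernelSketch>( new MEKernelDeviceSketch() ); // GPU build
#else
  return std::unique_ptr<IMEKernelSketch>( new MEKernelHostSketch() ); // CPU build
#endif
}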
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc index d0be5131af..ed38fae08c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -57,7 +56,7 @@ // Process: s s~ > t t~ c c~ WEIGHTED<=4 @2 // Process: s s~ > t t~ d d~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -91,7 +90,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -101,7 +100,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -129,13 +128,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -162,7 +161,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -198,7 +197,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -211,8 +210,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -393,7 +394,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -450,7 +451,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -509,7 +510,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -604,8 +605,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -647,9 +648,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -687,7 +688,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -716,6 +717,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
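In the CPPProcess.cc hunks above, checkCuda( cudaMemcpyToSymbol( ... ) ) becomes a bare gpuMemcpyToSymbol( ... ), the CudaRuntime.h include disappears, and new GpuAbstraction.h / GpuRuntime.h symlinks are added; the contents of those headers are not part of this diff. The sketch below is therefore only an assumption about what such an abstraction could look like (the gpuMemcpyToSymbolRaw helper macro and the printf/assert error handling are invented for illustration); it is not the actual GpuAbstraction.h.

#include <cassert>
#include <cstdio>
#if defined __CUDACC__
#include <cuda_runtime.h>
#define gpuError_t cudaError_t
#define gpuSuccess cudaSuccess
#define gpuGetErrorString cudaGetErrorString
#define gpuMemcpyToSymbolRaw cudaMemcpyToSymbol
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuError_t hipError_t
#define gpuSuccess hipSuccess
#define gpuGetErrorString hipGetErrorString
#define gpuMemcpyToSymbolRaw hipMemcpyToSymbol
#endif

#if defined __CUDACC__ || defined __HIPCC__
// Fold the status check into the macro itself: this is why the call sites above
// no longer need an explicit checkCuda(...) wrapper around the copy.
#define gpuMemcpyToSymbol( symbol, src, bytes ) \
  do \
  { \
    gpuError_t err = gpuMemcpyToSymbolRaw( symbol, src, bytes ); \
    if( err != gpuSuccess ) \
    { \
      printf( "gpuMemcpyToSymbol failed: %s\n", gpuGetErrorString( err ) ); \
      assert( false ); \
    } \
  } while( false )
#endif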
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -729,6 +734,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -752,12 +759,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -778,7 +785,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -904,9 +911,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -930,7 +937,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -950,7 +957,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -964,9 +971,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -994,7 +1004,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1204,7 +1214,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h index 14490d782f..71fdc6e547 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -118,7 +118,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -131,7 +131,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -161,7 +161,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end 
of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxccx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginnining of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
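In the memory-buffer hunks above, GPU builds (MGONGPUCPP_GPUIMPL) pick the PinnedHostBuffer* types plus device buffers, while CPU builds fall back to the plain HostBuffer* types. Pinned (page-locked) host memory is what makes the later host-to-device and device-to-host copies faster and allows them, in principle, to overlap with kernel execution. The class below is an illustrative sketch of that distinction only (a hypothetical PinnedHostBufferSketch, written for the CUDA case for brevity); it is not the plugin's MemoryBuffers implementation.

#include <cstdlib>
#ifdef __CUDACC__
#include <cuda_runtime.h>
#endif

// A host buffer that is page-locked ("pinned") in GPU builds, so that copies to
// and from the device avoid an extra staging buffer, and a plain malloc'ed
// buffer otherwise.
template<typename T>
class PinnedHostBufferSketch
{
public:
  explicit PinnedHostBufferSketch( std::size_t n )
    : m_size( n ), m_data( nullptr )
  {
#ifdef __CUDACC__
    cudaMallocHost( (void**)&m_data, n * sizeof( T ) ); // pinned allocation
#else
    m_data = (T*)malloc( n * sizeof( T ) );
#endif
  }
  ~PinnedHostBufferSketch()
  {
#ifdef __CUDACC__
    cudaFreeHost( m_data );
#else
    free( m_data );
#endif
  }
  T* data() { return m_data; }
  std::size_t size() const { return m_size; }
private:
  std::size_t m_size;
  T* m_data;
};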
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc index 3a2178d534..b127281504 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ g g WEIGHTED<=4 @2 // Process: s s~ > t t~ g g WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -812,7 +813,7 @@ namespace mg5amcCpu { 16, -2, 0, 0, 0, 0, -2, 16, 16, 6, 48, 16 }, { 0, 0, 16, -2, -2, 16, 0, 0, 6, 16, 16, 48 } }; // 2-D array[12][12] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -869,7 +870,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -928,7 
+929,7 @@ namespace mg5amcCpu MEs_sv_previous += deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -1023,8 +1024,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, 1 }, { -1, 1, 1, -1, 1, -1 }, { -1, 1, 1, -1, 1, 1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -1066,9 +1067,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -1106,7 +1107,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -1135,6 +1136,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -1148,6 +1153,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -1171,12 +1178,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -1197,7 +1204,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1323,9 +1330,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1349,7 +1356,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1369,7 +1376,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1383,9 +1390,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1413,7 +1423,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1623,7 +1633,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h index 1543c29649..e9a24f516d 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end of file diff 
--git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxgg/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
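Throughout these check_sa.cc hunks the host/device split is now keyed on MGONGPUCPP_GPUIMPL rather than on the CUDA-only __CUDACC__, while genuinely backend-specific logic (curand defaults, the "_CUDA"/"_HIP" process tags) keeps testing __CUDACC__ or __HIPCC__ directly. The switch itself is defined in a common header that is not shown in this diff; a plausible minimal form, inferred only from how it is used here, is:

#if defined __CUDACC__ || defined __HIPCC__
#define MGONGPUCPP_GPUIMPL 1 // any GPU build (CUDA on NVidia or HIP on AMD)
#endif
// i.e. '#ifdef MGONGPUCPP_GPUIMPL' now means "device build of either flavour",
// and '#ifndef MGONGPUCPP_GPUIMPL' means "C++ host build".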
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
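The "00 GpuInit" step above replaces the CUDA-only CudaRuntime helper with a backend-neutral GpuRuntime. The class itself lives in GpuRuntime.h, which this patch only symlinks; a minimal sketch of the RAII behaviour described in its comment, assuming a CUDA backend (a HIP build would use hipSetDevice/hipDeviceReset instead):

#include <cuda_runtime.h>
#include <iostream>
struct GpuRuntimeSketch // illustrative stand-in, not the real GpuRuntime class
{
  explicit GpuRuntimeSketch( bool debug = false )
    : m_debug( debug )
  {
    // Select the device once, at the beginning of the application's main
    if( cudaSetDevice( 0 ) != cudaSuccess )
      std::cerr << "WARNING! cudaSetDevice(0) failed" << std::endl;
    else if( m_debug )
      std::cout << "GpuRuntimeSketch: device 0 selected" << std::endl;
  }
  ~GpuRuntimeSketch()
  {
    // "Book" the device reset for the end of main, via the destructor
    cudaDeviceReset();
    if( m_debug ) std::cout << "GpuRuntimeSketch: device reset" << std::endl;
  }
private:
  bool m_debug;
};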
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc index 70fbbee59f..80d2682458 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d d~ > t t~ d d~ WEIGHTED<=4 @2 // Process: s s~ > t t~ s s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { -1, 1, 1, -1, -1, -1 }, { -1, 1, 1, -1, 1, 1 }, { -1, 1, 1, -1, 1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -820,6 +821,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -833,6 +838,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -856,12 +863,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +889,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1015,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1041,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1061,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1075,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1108,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1318,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h index 58cece5c62..d8d3d481ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/GpuRuntime.h @@ -0,0 +1 @@ +../GpuRuntime.h \ No newline at end 
of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uux_ttxuux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) 
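In the CPPProcess.cc hunks above, the checkCuda( cudaMemcpyToSymbol( ... ) ) calls become plain gpuMemcpyToSymbol( ... ), with the newly symlinked GpuAbstraction.h expected to provide the mapping to the native runtime. A sketch of what such a mapping could look like; apart from the CUDA/HIP calls themselves, the details are assumptions, and the error checking formerly done by checkCuda is assumed to move inside the real wrapper:

#ifdef __CUDACC__
#include <cuda_runtime.h>
#define gpuMemcpyToSymbol( symbol, src, bytes ) cudaMemcpyToSymbol( symbol, src, bytes )
#elif defined __HIPCC__
#include <hip/hip_runtime.h>
#define gpuMemcpyToSymbol( symbol, src, bytes ) hipMemcpyToSymbol( HIP_SYMBOL( symbol ), src, bytes )
#endif
// e.g. 'gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) );' then compiles
// to the CUDA or HIP constant-memory copy depending on the backend.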
bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a. 
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
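The buffer hunks above keep the ordinary HostBuffer* types for C++-only builds and switch to PinnedHostBuffer* (alongside the device-side dev* buffers used by the kernels) when MGONGPUCPP_GPUIMPL is set. The distinction is standard GPU practice: pinned (page-locked) host memory allows fast, asynchronous host-to-device and device-to-host copies. A minimal illustration with the raw CUDA API only; the real buffer classes are defined elsewhere in the plugin and are not part of this diff:

#include <cuda_runtime.h>
#include <cstdlib>
// Allocate nbytes of host memory: page-locked for GPU builds, pageable otherwise
inline void* allocHostBuffer( std::size_t nbytes, bool pinned )
{
  if( pinned ) // PinnedHostBuffer-like: release later with cudaFreeHost
  {
    void* ptr = nullptr;
    if( cudaMallocHost( &ptr, nbytes ) != cudaSuccess ) return nullptr;
    return ptr;
  }
  return std::malloc( nbytes ); // HostBuffer-like: release later with std::free
}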
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc index 7df13a2341..8dd445388c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -51,7 +50,7 @@ // Process: c~ s~ > t t~ c~ s~ WEIGHTED<=4 @2 // Process: d~ s~ > t t~ d~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -85,7 +84,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -95,7 +94,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -123,13 +122,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -156,7 +155,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -192,7 +191,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -205,8 +204,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -387,7 +388,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -444,7 +445,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -503,7 +504,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -598,8 +599,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -641,9 +642,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -681,7 +682,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -710,6 +711,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -723,6 +728,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -746,12 +753,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -772,7 +779,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -898,9 +905,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -924,7 +931,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -944,7 +951,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 36 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -958,9 +965,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -988,7 +998,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1198,7 +1208,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h index 6bd3135c3c..901c6dfcc9 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -112,7 +112,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -125,7 +125,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -155,7 +155,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/GpuRuntime.h @@ -0,0 +1 @@ 
+../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxcx_ttxuxcx/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
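check_sa.cc now includes GpuRuntime.h and, further down, instantiates a GpuRuntime where it previously instantiated a CudaRuntime. The class itself is not part of this excerpt; going only by the comment in the patch ("For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor"), a minimal RAII sketch could look as follows. The class layout, the debug printout and the gpu* aliases are assumptions, not the real GpuRuntime.h.

#include <iostream>
// Sketch (assumption): an RAII helper that selects the GPU when main() starts and resets it on exit.
struct ExampleGpuRuntime
{
  ExampleGpuRuntime( bool debug = false )
    : m_debug( debug )
  {
    if( m_debug ) std::cout << "ExampleGpuRuntime: selecting device 0" << std::endl;
    gpuSetDevice( 0 ); // cudaSetDevice( 0 ) on CUDA, hipSetDevice( 0 ) on HIP
  }
  ~ExampleGpuRuntime()
  {
    if( m_debug ) std::cout << "ExampleGpuRuntime: resetting the device" << std::endl;
    gpuDeviceReset(); // cudaDeviceReset() on CUDA, hipDeviceReset() on HIP
  }
  const bool m_debug;
};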
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
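In the buffer allocations above, every host-side buffer switches between a plain HostBuffer type for C++ builds and a PinnedHostBuffer type for GPU builds under the same MGONGPUCPP_GPUIMPL guard. The real buffer classes are defined elsewhere in the plugin; the snippet below is only an illustration of the rationale, with made-up names and assuming a CUDA build, showing why page-locked (pinned) host memory is preferred when data is staged to and from a device:

#include <cstdlib>
#include <cuda_runtime.h> // illustration assumes a CUDA build
// Illustration only (not the plugin's actual buffer classes): pinned host memory allows faster,
// and asynchronous, host-to-device and device-to-host copies than pageable malloc'ed memory.
template<typename T>
struct ExamplePinnedHostBuffer
{
  ExamplePinnedHostBuffer( std::size_t n ) { cudaMallocHost( (void**)&m_data, n * sizeof( T ) ); }
  ~ExamplePinnedHostBuffer() { cudaFreeHost( m_data ); }
  T* m_data;
};
template<typename T>
struct ExampleHostBuffer
{
  ExampleHostBuffer( std::size_t n ) { m_data = (T*)std::malloc( n * sizeof( T ) ); }
  ~ExampleHostBuffer() { std::free( m_data ); }
  T* m_data;
};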
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc index f464c27160..08e9ed321f 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi, Z. Wettersten (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 
3.5.2_lo_vect, 2023-11-08 @@ -16,7 +16,6 @@ #include "mgOnGpuConfig.h" -#include "CudaRuntime.h" #include "HelAmps_sm.h" #include "MemoryAccessAmplitudes.h" #include "MemoryAccessCouplings.h" @@ -49,7 +48,7 @@ // Process: d~ d~ > t t~ d~ d~ WEIGHTED<=4 @2 // Process: s~ s~ > t t~ s~ s~ WEIGHTED<=4 @2 -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +82,7 @@ namespace mg5amcCpu __device__ const fptype cIPD[2] = { (fptype)Parameters_sm::mdl_MT, (fptype)Parameters_sm::mdl_WT }; __device__ const fptype* cIPC = nullptr; // unused as nicoup=0 #else -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ fptype cIPD[2]; __device__ __constant__ fptype* cIPC = nullptr; // unused as nicoup=0 #else @@ -93,7 +92,7 @@ namespace mg5amcCpu #endif // Helicity combinations (and filtering of "good" helicity combinations) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __device__ __constant__ short cHel[ncomb][npar]; __device__ __constant__ int cNGoodHel; __device__ __constant__ int cGoodHel[ncomb]; @@ -121,13 +120,13 @@ namespace mg5amcCpu fptype* allDenominators, // output: multichannel denominators[nevt], running_sum_over_helicities #endif fptype_sv* jamp2_sv // output: jamp2[nParity][ncolor][neppV] for color choice (nullptr if disabled) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int ievt00 // input: first event number in current C++ event page (for CUDA, ievt depends on threadid) #endif ) //ALWAYS_INLINE // attributes are not permitted in a function definition { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using M_ACCESS = DeviceAccessMomenta; // non-trivial access: buffer includes all events using E_ACCESS = DeviceAccessMatrixElements; // non-trivial access: buffer includes all events @@ -154,7 +153,7 @@ namespace mg5amcCpu #endif /* clang-format on */ mgDebug( 0, __FUNCTION__ ); //printf( "calculate_wavefunctions: ihel=%2d\n", ihel ); -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //printf( "calculate_wavefunctions: ievt00=%d\n", ievt00 ); #endif @@ -190,7 +189,7 @@ namespace mg5amcCpu #endif for( int iParity = 0; iParity < nParity; ++iParity ) { // START LOOP ON IPARITY -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL const int ievt0 = ievt00 + iParity * neppV; #endif constexpr size_t nxcoup = ndcoup + nicoup; // both dependent and independent couplings @@ -203,8 +202,10 @@ namespace mg5amcCpu allCOUPs[idcoup] = CD_ACCESS::idcoupAccessBufferConst( allcouplings, idcoup ); // dependent couplings, vary event-by-event for( size_t iicoup = 0; iicoup < nicoup; iicoup++ ) allCOUPs[ndcoup + iicoup] = CI_ACCESS::iicoupAccessBufferConst( cIPC, iicoup ); // independent couplings, fixed for all events +#ifdef MGONGPUCPP_GPUIMPL #ifdef __CUDACC__ #pragma nv_diagnostic pop +#endif // CUDA kernels take input/output buffers with momenta/MEs for all events const fptype* momenta = allmomenta; const fptype* COUPs[nxcoup]; @@ -497,7 +498,7 @@ namespace mg5amcCpu { 3, 9, 9, 3, 27, 9 }, { 9, 3, 3, 9, 9, 27 } }; // 2-D array[6][6] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Pre-compute a constexpr triangular color matrix properly normalized #475 struct TriangularNormalizedColorMatrix { @@ -554,7 +555,7 @@ namespace mg5amcCpu #endif for( int icol = 0; icol < ncolor; icol++ ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // === C++ START === // Diagonal terms #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT @@ -613,7 +614,7 @@ namespace mg5amcCpu MEs_sv_previous += 
deltaMEs_previous; #endif /* -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if ( cNGoodHel > 0 ) printf( "calculate_wavefunctions: ievt=%6d ihel=%2d me_running=%f\n", blockDim.x * blockIdx.x + threadIdx.x, ihel, MEs_sv ); #else #ifdef MGONGPU_CPPSIMD @@ -708,8 +709,8 @@ namespace mg5amcCpu { 1, 1, 1, -1, 1, -1 }, { 1, 1, 1, -1, -1, 1 }, { 1, 1, 1, -1, -1, -1 } }; -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cHel, tHel, ncomb * npar * sizeof( short ) ); #else memcpy( cHel, tHel, ncomb * npar * sizeof( short ) ); #endif @@ -751,9 +752,9 @@ namespace mg5amcCpu // Then copy them to CUDA constant memory (issue #39) or its C++ emulation in file-scope static memory const fptype tIPD[2] = { (fptype)m_pars->mdl_MT, (fptype)m_pars->mdl_WT }; //const cxtype tIPC[0] = { ... }; // nicoup=0 -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ) ); - //checkCuda( cudaMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ) ); // nicoup=0 +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cIPD, tIPD, 2 * sizeof( fptype ) ); + //gpuMemcpyToSymbol( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 #else memcpy( cIPD, tIPD, 2 * sizeof( fptype ) ); //memcpy( cIPC, tIPC, 0 * sizeof( cxtype ) ); // nicoup=0 @@ -791,7 +792,7 @@ namespace mg5amcCpu { std::stringstream out; // CUDA version (NVCC) - // [Use __NVCC__ instead of __CUDACC__ here!] + // [Use __NVCC__ instead of MGONGPUCPP_GPUIMPL here!] // [This tests if 'nvcc' was used even to build a .cc file, even if not necessarily 'nvcc -x cu' for a .cu file] // [Check 'nvcc --compiler-options -dM -E dummy.c | grep CUDA': see https://stackoverflow.com/a/53713712] #ifdef __NVCC__ @@ -820,6 +821,10 @@ namespace mg5amcCpu out << "Apple clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; #else out << "clang " << __clang_major__ << "." << __clang_minor__ << "." << __clang_patchlevel__; + /* + // === AV 26-Jan-2024 DISABLE THIS CODE (START) + // === AV 26-Jan-2024 First, it is totally wrong to assume that the CXX environment variable is used in the build! + // === AV 26-Jan-2024 Second and worse, here we need build time values, while CXX in this code is evaluated at runtime! 
// GCC toolchain version inside CLANG std::string tchainout; std::string tchaincmd = "readelf -p .comment $(${CXX} -print-libgcc-file-name) |& grep 'GCC: (GNU)' | grep -v Warning | sort -u | awk '{print $5}'"; @@ -833,6 +838,8 @@ namespace mg5amcCpu #else out << " (gcc " << tchainout << ")"; #endif + // === AV 26-Jan-2024 DISABLE THIS CODE (END) + */ #endif #else out << "clang UNKNOWKN"; @@ -856,12 +863,12 @@ namespace mg5amcCpu __global__ void /* clang-format off */ computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings // output: couplings[nevt*ndcoup*2] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; using G_ACCESS = DeviceAccessGs; using C_ACCESS = DeviceAccessCouplings; @@ -882,7 +889,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -1008,9 +1015,9 @@ namespace mg5amcCpu nGoodHel++; } } -#ifdef __CUDACC__ - checkCuda( cudaMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ) ); - checkCuda( cudaMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ) ); +#ifdef MGONGPUCPP_GPUIMPL + gpuMemcpyToSymbol( cNGoodHel, &nGoodHel, sizeof( int ) ); + gpuMemcpyToSymbol( cGoodHel, goodHel, ncomb * sizeof( int ) ); #else cNGoodHel = nGoodHel; for( int ihel = 0; ihel < ncomb; ihel++ ) cGoodHel[ihel] = goodHel[ihel]; @@ -1034,7 +1041,7 @@ namespace mg5amcCpu #endif int* allselhel, // output: helicity selection[nevt] int* allselcol // output: helicity selection[nevt] -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL , const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads) #endif ) /* clang-format on */ @@ -1054,7 +1061,7 @@ namespace mg5amcCpu // Denominators: spins, colors and identical particles constexpr int helcolDenominators[1] = { 72 }; // assume nprocesses == 1 (#272 and #343) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Remember: in CUDA this is a kernel for one event, in c++ this processes n events const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid #else @@ -1068,9 +1075,12 @@ namespace mg5amcCpu #endif // Start sigmaKin_lines + +#include "GpuAbstraction.h" + // === PART 0 - INITIALISATION (before calculate_wavefunctions) === // Reset the "matrix elements" - running sums of |M|^2 over helicities for the given event -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] = 0; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL allNumerators[ievt] = 0; @@ -1098,7 +1108,7 @@ namespace mg5amcCpu // === PART 1 - HELICITY LOOP: CALCULATE WAVEFUNCTIONS === // (in both CUDA and C++, using precomputed good helicities) -#ifdef __CUDACC__ // CUDA OR C++ +#ifdef MGONGPUCPP_GPUIMPL // CUDA OR C++ // *** START OF PART 1a - CUDA (one event per CPU thread) *** // Running sum of partial amplitudes squared for event by event color selection (#402) @@ -1308,7 +1318,7 @@ namespace mg5amcCpu // Get the final |M|^2 as an average over helicities/colors of the running sum of |M|^2 over helicities for the given event // [NB 'sum over final spins, average over initial spins', eg see // 
https://www.uzh.ch/cmsssl/physik/dam/jcr:2e24b7b1-f4d7-4160-817e-47b13dbf1d7c/Handout_4_2016-UZH.pdf] -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL allMEs[ievt] /= helcolDenominators[0]; #ifdef MGONGPU_SUPPORTS_MULTICHANNEL if( channelId > 0 ) allMEs[ievt] *= allNumerators[ievt] / allDenominators[ievt]; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h index 4e53fa1250..c2ca443c0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CPPProcess.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -25,7 +25,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -110,7 +110,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void computeDependentCouplings( const fptype* allgs, // input: Gs[nevt] fptype* allcouplings ); // output: couplings[nevt*ndcoup*2] @@ -123,7 +123,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] @@ -153,7 +153,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ __global__ void sigmaKin( const fptype* allmomenta, // input: momenta[nevt*npar*4] const fptype* allcouplings, // input: couplings[nevt*ndcoup*2] diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h deleted file mode 120000 index ce9e1a487a..0000000000 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/CudaRuntime.h +++ /dev/null @@ -1 +0,0 @@ -../CudaRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h new file mode 120000 index 0000000000..72054e19ba --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuAbstraction.h @@ -0,0 +1 @@ +../GpuAbstraction.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h new file mode 120000 index 0000000000..3920e83be4 --- /dev/null +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/GpuRuntime.h @@ -0,0 +1 @@ 
+../GpuRuntime.h \ No newline at end of file diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc index 3fbf0ffbee..bde384c69e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/P2_uxux_ttxuxux/check_sa.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: O. Mattelaer (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -12,6 +12,7 @@ #include "BridgeKernels.h" #include "CPPProcess.h" #include "CrossSectionKernels.h" +#include "GpuRuntime.h" #include "MatrixElementKernels.h" #include "MemoryAccessMatrixElements.h" #include "MemoryAccessMomenta.h" @@ -65,7 +66,7 @@ usage( char* argv0, int ret = 1 ) std::cout << std::endl; std::cout << "Summary stats are always computed: '-p' and '-j' only control their printout" << std::endl; std::cout << "The '-d' flag only enables NaN/abnormal warnings and OMP debugging" << std::endl; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP std::cout << std::endl; std::cout << "Use the OMP_NUM_THREADS environment variable to control OMP multi-threading" << std::endl; @@ -75,7 +76,7 @@ usage( char* argv0, int ret = 1 ) return ret; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +84,7 @@ namespace mg5amcCpu { inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU)" << std::endl; #else std::cerr << "Floating Point Exception (CPU)" << std::endl; @@ -96,7 +97,7 @@ int main( int argc, char** argv ) { // Namespaces for CUDA and C++ (FIXME - eventually use the same namespace everywhere...) 
-#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -134,9 +135,11 @@ main( int argc, char** argv ) CurandDevice = 2 }; #ifdef MGONGPU_HAS_NO_CURAND - RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784) + RandomNumberMode rndgen = RandomNumberMode::CommonRandom; // this is the only supported mode if build has no curand (PR #784 and #785) +#elif defined __HIPCC__ +#error Internal error: MGONGPU_HAS_NO_CURAND should have been set for __HIPCC__ // default on AMD GPUs should be common random #elif defined __CUDACC__ - RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on GPU if build has curand + RandomNumberMode rndgen = RandomNumberMode::CurandDevice; // default on NVidia GPU if build has curand #else RandomNumberMode rndgen = RandomNumberMode::CurandHost; // default on CPU if build has curand #endif @@ -146,10 +149,10 @@ main( int argc, char** argv ) RamboHost = 1, RamboDevice = 2 }; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingMode rmbsmp = RamboSamplingMode::RamboDevice; // default on GPU #else - RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU + RamboSamplingMode rmbsmp = RamboSamplingMode::RamboHost; // default on CPU #endif // Bridge emulation mode (NB Bridge implies RamboHost!) bool bridge = false; @@ -177,7 +180,7 @@ main( int argc, char** argv ) else if( arg == "--curdev" ) { #ifndef __CUDACC__ - throw std::runtime_error( "CurandDevice is not supported on CPUs" ); + throw std::runtime_error( "CurandDevice is not supported on CPUs or non-NVidia GPUs" ); #elif defined MGONGPU_HAS_NO_CURAND throw std::runtime_error( "CurandDevice is not supported because this application was built without Curand support" ); #else @@ -198,7 +201,7 @@ main( int argc, char** argv ) } else if( arg == "--rmbdev" ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL rmbsmp = RamboSamplingMode::RamboDevice; #else throw std::runtime_error( "RamboDevice is not supported on CPUs" ); @@ -272,13 +275,13 @@ main( int argc, char** argv ) return usage( argv[0] ); } -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP ompnumthreadsNotSetMeansOneThread( debug ? 1 : 0 ); // quiet(-1), info(0), debug(1) #endif #endif -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Fail gently and avoid "Illegal instruction (core dumped)" if the host does not support the SIMD used in the ME calculation // Note: this prevents a crash on pmpe04 but not on some github CI nodes? // [NB: SIMD vectorization in mg5amc C++ code is only used in the ME calculation below MatrixElementKernelHost!] @@ -296,14 +299,14 @@ main( int argc, char** argv ) // === STEP 0 - INITIALISE -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL - // --- 00. Initialise cuda - // Instantiate a CudaRuntime at the beginnining of the application's main to - // invoke cudaSetDevice(0) in the constructor and book a cudaDeviceReset() call in the destructor - const std::string cdinKey = "00 CudaInit"; + // --- 00. Initialise GPU + // Instantiate a GpuRuntime at the beginning of the application's main. + // For CUDA this invokes cudaSetDevice(0) in the constructor and books a cudaDeviceReset() call in the destructor. + const std::string cdinKey = "00 GpuInit"; timermap.start( cdinKey ); - CudaRuntime cudaRuntime( debug ); + GpuRuntime GpuRuntime( debug ); #endif // --- 0a.
Initialise physics process @@ -325,7 +328,7 @@ main( int argc, char** argv ) timermap.start( alloKey ); // Memory buffers for random numbers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta hstRndmom( nevt ); #else PinnedHostBufferRndNumMomenta hstRndmom( nevt ); @@ -333,7 +336,7 @@ main( int argc, char** argv ) #endif // Memory buffers for sampling weights -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferWeights hstWeights( nevt ); #else PinnedHostBufferWeights hstWeights( nevt ); @@ -341,7 +344,7 @@ main( int argc, char** argv ) #endif // Memory buffers for momenta -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMomenta hstMomenta( nevt ); #else PinnedHostBufferMomenta hstMomenta( nevt ); @@ -349,7 +352,7 @@ main( int argc, char** argv ) #endif // Memory buffers for Gs -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferGs hstGs( nevt ); #else PinnedHostBufferGs hstGs( nevt ); @@ -366,7 +369,7 @@ main( int argc, char** argv ) } // Memory buffers for matrix elements -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferMatrixElements hstMatrixElements( nevt ); #else PinnedHostBufferMatrixElements hstMatrixElements( nevt ); @@ -375,7 +378,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for helicity selection // *** NB #403 these buffers always remain initialised at 0: no need for helicity choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumHelicity hstRndHel( nevt ); #else PinnedHostBufferRndNumHelicity hstRndHel( nevt ); @@ -384,7 +387,7 @@ main( int argc, char** argv ) // Memory buffers for random numbers for color selection // *** NB #402 these buffers always remain initialised at 0: no need for color choice in gcheck/check (no LHE produced) *** -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumColor hstRndCol( nevt ); #else PinnedHostBufferRndNumColor hstRndCol( nevt ); @@ -392,7 +395,7 @@ main( int argc, char** argv ) #endif // Memory buffers for helicity selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedHelicity hstSelHel( nevt ); #else PinnedHostBufferSelectedHelicity hstSelHel( nevt ); @@ -400,7 +403,7 @@ main( int argc, char** argv ) #endif // Memory buffers for color selection -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferSelectedColor hstSelCol( nevt ); #else PinnedHostBufferSelectedColor hstSelCol( nevt ); @@ -438,7 +441,7 @@ main( int argc, char** argv ) const bool onDevice = true; prnk.reset( new CurandRandomNumberKernel( devRndmom, onDevice ) ); #else - throw std::logic_error( "INTERNAL ERROR! CurandDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) + throw std::logic_error( "INTERNAL ERROR! 
CurandDevice is not supported on CPUs or non-NVidia GPUs" ); // INTERNAL ERROR (no path to this statement) #endif } @@ -450,7 +453,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL prsk.reset( new RamboSamplingKernelDevice( energy, devRndmom, devMomenta, devWeights, gpublocks, gputhreads ) ); #else throw std::logic_error( "RamboDevice is not supported on CPUs" ); // INTERNAL ERROR (no path to this statement) @@ -461,7 +464,7 @@ main( int argc, char** argv ) std::unique_ptr pmek; if( !bridge ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -469,7 +472,7 @@ main( int argc, char** argv ) } else { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL pmek.reset( new BridgeKernelDevice( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, gpublocks, gputhreads ) ); #else pmek.reset( new BridgeKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) ); @@ -511,7 +514,7 @@ main( int argc, char** argv ) prnk->generateRnarray(); //std::cout << "Got random numbers" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rndgen != RandomNumberMode::CurandDevice && rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 1c. Copy rndmom from host to device @@ -543,7 +546,7 @@ main( int argc, char** argv ) prsk->getMomentaFinal(); //std::cout << "Got final momenta" << std::endl; -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( rmbsmp == RamboSamplingMode::RamboDevice ) { // --- 2c. CopyDToH Weights @@ -588,7 +591,7 @@ main( int argc, char** argv ) dynamic_cast( pmek.get() )->transposeInputMomentaC2F(); } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // --- 2d. CopyHToD Momenta const std::string gKey = "0.. CpHTDg"; rambtime += timermap.start( gKey ); // FIXME! NOT A RAMBO TIMER! @@ -617,7 +620,7 @@ main( int argc, char** argv ) wv3atime += timermap.stop(); // calc only wavetime += wv3atime; // calc plus copy -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if( !bridge ) { // --- 3b. CopyDToH MEs @@ -760,18 +763,22 @@ main( int argc, char** argv ) rndgentxt = "CURAND DEVICE"; #ifdef __CUDACC__ rndgentxt += " (CUDA code)"; +#elif defined __HIPCC__ + rndgentxt += " (HIP code)"; #else rndgentxt += " (C++ code)"; #endif // Workflow description summary std::string wrkflwtxt; - // -- CUDA or C++? + // -- CUDA or HIP or C++? #ifdef __CUDACC__ wrkflwtxt += "CUD:"; +#elif defined __HIPCC__ + wrkflwtxt += "HIP:"; #else wrkflwtxt += "CPP:"; -#endif +#endif /* clang-format off */ // -- DOUBLE or FLOAT? #if defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT wrkflwtxt += "MIX+"; // mixed fptypes (single precision color algebra #537) @@ -781,8 +788,8 @@ main( int argc, char** argv ) wrkflwtxt += "FLT+"; #else wrkflwtxt += "???+"; // no path to this statement -#endif - // -- CUCOMPLEX or THRUST or STD complex numbers? +#endif /* clang-format on */ + // -- CUCOMPLEX or THRUST or STD or CXSIMPLE complex numbers? 
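Note that two kinds of guards deliberately coexist after this patch: MGONGPUCPP_GPUIMPL marks code that is generic to any GPU build (pinned buffers, device kernels, host/device copies), while __CUDACC__ and __HIPCC__ remain where the behaviour or the printout genuinely depends on the vendor toolchain (the "(CUDA code)"/"(HIP code)" suffix, the CUD:/HIP:/CPP: workflow tag, the _CUDA/_HIP/_CPP process name, the CUDA-only curand device path). A condensed, non-verbatim illustration of the two idioms, using lines taken from the hunks above:

// Any-GPU guard: the same branch is taken for CUDA and HIP builds.
#ifdef MGONGPUCPP_GPUIMPL
  pmek.reset( new MatrixElementKernelDevice( devMomenta, devGs, devRndHel, devRndCol, devMatrixElements, devSelHel, devSelCol, gpublocks, gputhreads ) );
#else
  pmek.reset( new MatrixElementKernelHost( hstMomenta, hstGs, hstRndHel, hstRndCol, hstMatrixElements, hstSelHel, hstSelCol, nevt ) );
#endif
// Vendor-specific guard: the CUDA, HIP and C++ cases each get their own tag.
#ifdef __CUDACC__
  wrkflwtxt += "CUD:";
#elif defined __HIPCC__
  wrkflwtxt += "HIP:";
#else
  wrkflwtxt += "CPP:";
#endif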
#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX wrkflwtxt += "CUX:"; @@ -793,6 +800,12 @@ main( int argc, char** argv ) #else wrkflwtxt += "???:"; // no path to this statement #endif +#elif defined __HIPCC__ +#if defined MGONGPU_HIPCXTYPE_CXSMPL + wrkflwtxt += "CXS:"; +#else + wrkflwtxt += "???:"; // no path to this statement +#endif #else #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX wrkflwtxt += "STX:"; @@ -818,7 +831,7 @@ main( int argc, char** argv ) wrkflwtxt += "RMBDEV+"; else wrkflwtxt += "??????+"; // no path to this statement -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // -- HOST or DEVICE matrix elements? Standalone MEs or BRIDGE? if( !bridge ) wrkflwtxt += "MESDEV"; @@ -850,7 +863,7 @@ main( int argc, char** argv ) wrkflwtxt += "/sse4"; #endif #else - wrkflwtxt += "/????"; // no path to this statement + wrkflwtxt += "/????"; // no path to this statement #endif // -- Has cxtype_v::operator[] bracket with non-const reference? #if defined MGONGPU_CPPSIMD @@ -874,7 +887,7 @@ main( int argc, char** argv ) if( perf ) { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP // Get the output of "nproc --all" (https://stackoverflow.com/a/478960) std::string nprocall; @@ -895,6 +908,8 @@ main( int argc, char** argv ) std::cout << std::string( SEP79, '*' ) << std::endl #ifdef __CUDACC__ << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CUDA" +#elif defined __HIPCC__ + << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_HIP" #else << "Process = " << XSTRINGIFY( MG_EPOCH_PROCESS_ID ) << "_CPP" #endif @@ -921,21 +936,23 @@ main( int argc, char** argv ) #elif defined MGONGPU_FPTYPE_FLOAT << "FP precision = FLOAT (NaN/abnormal=" << nabn << ", zero=" << nzero << ")" << std::endl #endif -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "Complex type = CUCOMPLEX" << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "Complex type = THRUST::COMPLEX" << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "Complex type = CXSIMPLE" << std::endl +#elif defined MGONGPU_CPPCXTYPE_STDCOMPLEX << "Complex type = STD::COMPLEX" << std::endl +#else + << "Complex type = ???" << std::endl // no path to this statement... #endif << "RanNumb memory layout = AOSOA[" << neppR << "]" << ( neppR == 1 ? " == AOS" : "" ) << " [HARDCODED FOR REPRODUCIBILITY]" << std::endl << "Momenta memory layout = AOSOA[" << neppM << "]" << ( neppM == 1 ? 
" == AOS" : "" ) << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "Wavefunction GPU memory = LOCAL" << std::endl #else #if !defined MGONGPU_CPPSIMD @@ -966,7 +983,7 @@ main( int argc, char** argv ) #endif #endif << "Random number generation = " << rndgentxt << std::endl -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL #ifdef _OPENMP << "OMP threads / `nproc --all` = " << omp_get_max_threads() << " / " << nprocall // includes a newline #endif @@ -1062,14 +1079,16 @@ main( int argc, char** argv ) << "\"FLOAT (NaN/abnormal=" << nabn << ")\"," << std::endl #endif << "\"Complex type\": " -#ifdef __CUDACC__ #if defined MGONGPU_CUCXTYPE_CUCOMPLEX << "\"CUCOMPLEX\"," << std::endl #elif defined MGONGPU_CUCXTYPE_THRUST << "\"THRUST::COMPLEX\"," << std::endl -#endif -#else +#elif defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL + << "\"CXSIMPLE\"," << std::endl +#elif defined MGONGPU_CUCXTYPE_STDCOMPLEX << "\"STD::COMPLEX\"," << std::endl +#else + << "\"???\"," << std::endl // no path to this statement... #endif << "\"RanNumb memory layout\": " << "\"AOSOA[" << neppR << "]\"" @@ -1077,7 +1096,7 @@ main( int argc, char** argv ) << "\"Momenta memory layout\": " << "\"AOSOA[" << neppM << "]\"" << ( neppM == 1 ? " == AOS" : "" ) << ", " << std::endl -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //<< "\"Wavefunction GPU memory\": " << "\"LOCAL\"," << std::endl #endif << "\"Curand generation\": " diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc index da68aa9255..79abbcc4f8 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#include "RamboSamplingKernels.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" #include "MemoryAccessMomenta.h" #include "MemoryAccessRandomNumbers.h" #include "MemoryAccessWeights.h" @@ -14,7 +14,7 @@ #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +92,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL RamboSamplingKernelDevice::RamboSamplingKernelDevice( const fptype energy, // input: energy const BufferRndNumMomenta& rndmom, // input: random numbers in [0,1] BufferMomenta& momenta, // output: momenta @@ -135,7 +135,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaInitialDevice( const fptype energy, fptype* momenta ) @@ -147,17 +147,17 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaInitial() { - getMomentaInitialDevice<<>>( m_energy, m_momenta.data() ); + gpuLaunchKernel( getMomentaInitialDevice, m_gpublocks, m_gputhreads, m_energy, m_momenta.data() ); } #endif //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL __global__ void getMomentaFinalDevice( const fptype energy, const fptype* rndmom, @@ -171,11 +171,11 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL void RamboSamplingKernelDevice::getMomentaFinal() { - getMomentaFinalDevice<<>>( m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); + gpuLaunchKernel( getMomentaFinalDevice, m_gpublocks, m_gputhreads, m_energy, m_rndmom.data(), m_momenta.data(), m_weights.data() ); } #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h index 184089efd7..7c214cd74b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RamboSamplingKernels.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
#ifndef RAMBOSAMPLINGKERNELS_H #define RAMBOSAMPLINGKERNELS_H 1 @@ -10,7 +10,7 @@ #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // A class encapsulating RAMBO phase space sampling on a GPU device class RamboSamplingKernelDevice final : public SamplingKernelBase, public NumberOfEvents { diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h index 188a72c2c9..21d63beeac 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/RandomNumberKernels.h @@ -1,14 +1,14 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Dec 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #ifndef RANDOMNUMBERKERNELS_H #define RANDOMNUMBERKERNELS_H 1 #include "mgOnGpuConfig.h" -// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when __CUDACC__ is not defined +// NB This must come AFTER mgOnGpuConfig.h which contains our definition of __global__ when MGONGPUCPP_GPUIMPL is not defined #ifndef MGONGPU_HAS_NO_CURAND //#include "curand.h" struct curandGenerator_st; // forward definition from curand.h @@ -16,7 +16,7 @@ struct curandGenerator_st; // forward definition from curand.h #include "MemoryBuffers.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk index 509307506b..df74dfc284 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/cudacpp.mk @@ -1,7 +1,7 @@ # Copyright (C) 2020-2023 CERN and UCLouvain. # Licensed under the GNU Lesser General Public License (version 3 or later). # Created by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -# Further modified by: O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +# Further modified by: O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. #=== Determine the name of this makefile (https://ftp.gnu.org/old-gnu/Manuals/make-3.80/html_node/make_17.html) #=== NB: use ':=' to ensure that the value of CUDACPP_MAKEFILE is not modified further down after including make_opts @@ -42,10 +42,10 @@ endif #------------------------------------------------------------------------------- -#=== Configure common compiler flags for C++ and CUDA +#=== Configure common compiler flags for C++ and CUDA/HIP INCFLAGS = -I. -OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here # Dependency on src directory MG5AMC_COMMONLIB = mg5amc_common @@ -121,24 +121,45 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler +#=== Configure the GPU compiler (CUDA or HIP) + +# FIXME! 
(AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), we first check for cudacc and hipcc in CUDA_HOME and HIP_HOME. +# If CUDA_HOME or HIP_HOME are not set, try to determine them from the path to cudacc and hipcc. +# While convoluted, this is currently necessary to allow disabling CUDA/HIP builds by setting CUDA_HOME or HIP_HOME to invalid paths. +# This will (probably?) be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). # If CXX is not a single word (example "clang++ --gcc-toolchain...") then disable CUDA builds (issue #505) -# This is because it is impossible to pass this to "CUFLAGS += -ccbin " below +# This is because it is impossible to pass this to "GPUFLAGS += -ccbin " below ifneq ($(words $(subst ccache ,,$(CXX))),1) # allow at most "CXX=ccache " from outside $(warning CUDA builds are not supported for multi-word CXX "$(CXX)") override CUDA_HOME=disabled endif -# If CUDA_HOME is not set, try to set it from the location of nvcc +# If CUDA_HOME is not set, try to set it from the path to nvcc ifndef CUDA_HOME - CUDA_HOME = $(patsubst %bin/nvcc,%,$(shell which nvcc 2>/dev/null)) + CUDA_HOME = $(patsubst %/bin/nvcc,%,$(shell which nvcc 2>/dev/null)) $(warning CUDA_HOME was not set: using "$(CUDA_HOME)") endif -# Set NVCC as $(CUDA_HOME)/bin/nvcc if it exists +# If HIP_HOME is not set, try to set it from the path to hipcc +ifndef HIP_HOME + HIP_HOME = $(patsubst %/bin/hipcc,%,$(shell which hipcc 2>/dev/null)) + $(warning HIP_HOME was not set: using "$(HIP_HOME)") +endif + +# FIXME! (AV 24.01.2024) +# In the current implementation (without separate builds for C++ and CUDA/HIP), +# builds are performed for HIP only if CUDA is not found in the path. +# If both CUDA and HIP are installed, HIP builds can be triggered by unsetting CUDA_HOME. +# This will be fixed when separate C++ and CUDA/HIP builds are implemented (PR #775). + +#--- Option 1: CUDA exists -> use CUDA + +# Set GPUCC as $(CUDA_HOME)/bin/nvcc if it exists ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) - NVCC = $(CUDA_HOME)/bin/nvcc + + GPUCC = $(CUDA_HOME)/bin/nvcc USE_NVTX ?=-DUSE_NVTX # See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html # See https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ @@ -148,7 +169,7 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) # Examples: use 60 for P100 (Piz Daint), 80 for A100 (Juwels Booster, NVidia raplab/Curiosity). 
MADGRAPH_CUDA_ARCHITECTURE ?= 70 ###CUARCHFLAGS = -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=compute_$(MADGRAPH_CUDA_ARCHITECTURE) -gencode arch=compute_$(MADGRAPH_CUDA_ARCHITECTURE),code=sm_$(MADGRAPH_CUDA_ARCHITECTURE) # Older implementation (AV): go back to this one for multi-GPU support #533 - ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 + ###CUARCHFLAGS = --gpu-architecture=compute_$(MADGRAPH_CUDA_ARCHITECTURE) --gpu-code=sm_$(MADGRAPH_CUDA_ARCHITECTURE),compute_$(MADGRAPH_CUDA_ARCHITECTURE) # Newer implementation (SH): cannot use this as-is for multi-GPU support #533 comma:=, CUARCHFLAGS = $(foreach arch,$(subst $(comma), ,$(MADGRAPH_CUDA_ARCHITECTURE)),-gencode arch=compute_$(arch),code=compute_$(arch) -gencode arch=compute_$(arch),code=sm_$(arch)) CUINC = -I$(CUDA_HOME)/include/ @@ -158,41 +179,79 @@ ifneq ($(wildcard $(CUDA_HOME)/bin/nvcc),) CURANDLIBFLAGS = -L$(CUDA_HOME)/lib64/ -lcurand # NB: -lcuda is not needed here! endif CUOPTFLAGS = -lineinfo - CUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math - ###CUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow - ###NVCC_VERSION = $(shell $(NVCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) - CUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h + ###GPUFLAGS = $(OPTFLAGS) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + GPUFLAGS = $(foreach opt, $(OPTFLAGS), -Xcompiler $(opt)) $(CUOPTFLAGS) $(INCFLAGS) $(CUINC) $(USE_NVTX) $(CUARCHFLAGS) -use_fast_math + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + ###GPUCC_VERSION = $(shell $(GPUCC) --version | grep 'Cuda compilation tools' | cut -d' ' -f5 | cut -d, -f1) + GPUFLAGS += -std=c++17 # need CUDA >= 11.2 (see #333): this is enforced in mgOnGpuConfig.h # Without -maxrregcount: baseline throughput: 6.5E8 (16384 32 12) up to 7.3E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) - ###CUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 160 # improves throughput: 6.9E8 (16384 32 12) up to 7.7E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 128 # improves throughput: 7.3E8 (16384 32 12) up to 7.6E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 96 # degrades throughput: 4.1E8 (16384 32 12) up to 4.5E8 (65536 128 12) + ###GPUFLAGS+= --maxrregcount 64 # degrades throughput: 1.7E8 (16384 32 12) flat at 1.7E8 (65536 128 12) + CUBUILDRULEFLAGS = -Xcompiler -fPIC -c + CCBUILDRULEFLAGS = -Xcompiler -fPIC -c -x cu + CUDATESTFLAGS = -lcuda + + # Set the host C++ compiler for GPUCC via "-ccbin " + # (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." 
is not supported) + GPUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) + + # Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) + ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) + GPUFLAGS += -allow-unsupported-compiler + endif + else ifneq ($(origin REQUIRE_CUDA),undefined) + # If REQUIRE_CUDA is set but no cuda is found, stop here (e.g. for CI tests on GPU #443) - $(error No cuda installation found (set CUDA_HOME or make nvcc visible in PATH)) + $(error No cuda installation found (set CUDA_HOME or make GPUCC visible in PATH)) + +#--- Option 2: CUDA does not exist, HIP exists -> use HIP + +# Set GPUCC as $(HIP_HOME)/bin/hipcc if it exists +else ifneq ($(wildcard $(HIP_HOME)/bin/hipcc),) + + GPUCC = $(HIP_HOME)/bin/hipcc + #USE_NVTX ?=-DUSE_NVTX # should maybe find something equivalent to this in HIP? + HIPARCHFLAGS = -target x86_64-linux-gnu --offload-arch=gfx90a + HIPINC = -I$(HIP_HOME)/include/ + # Note: -DHIP_FAST_MATH is equivalent to -use_fast_math in HIP + # (but only for single precision line 208: https://rocm-developer-tools.github.io/HIP/hcc__detail_2math__functions_8h_source.html) + # Note: CUOPTFLAGS should not be used for HIP, it had been added here but was then removed (#808) + GPUFLAGS = $(OPTFLAGS) $(INCFLAGS) $(HIPINC) $(HIPARCHFLAGS) -DHIP_FAST_MATH -DHIP_PLATFORM=amd -fPIC + ###GPUFLAGS += -Xcompiler -Wall -Xcompiler -Wextra -Xcompiler -Wshadow + GPUFLAGS += -std=c++17 + ###GPUFLAGS+= --maxrregcount 255 # (AV: is this option valid on HIP and meaningful on AMD GPUs?) + CUBUILDRULEFLAGS = -fPIC -c + CCBUILDRULEFLAGS = -fPIC -c -x hip + +else ifneq ($(origin REQUIRE_HIP),undefined) + + # If REQUIRE_HIP is set but no HIP is found, stop here (e.g. for CI tests on GPU #443) + $(error No hip installation found (set HIP_HOME or make GPUCC visible in PATH)) + +#--- Option 3: CUDA does not exist, HIP does not exist -> switch off both CUDA and HIP + else - # No cuda. Switch cuda compilation off and go to common random numbers in C++ + + # No cudacc and no hipcc: switch CUDA and HIP compilation off and go to common random numbers in C++ $(warning CUDA_HOME is not set or is invalid: export CUDA_HOME to compile with cuda) - override NVCC= + $(warning HIP_HOME is not set or is invalid: export HIP_HOME to compile with hip) + override GPUCC= override USE_NVTX= override CUINC= override CURANDLIBFLAGS= -endif -export NVCC -export CUFLAGS -# Set the host C++ compiler for nvcc via "-ccbin " -# (NB issue #505: this must be a single word, "clang++ --gcc-toolchain..." is not supported) -CUFLAGS += -ccbin $(shell which $(subst ccache ,,$(CXX))) - -# Allow newer (unsupported) C++ compilers with older versions of CUDA if ALLOW_UNSUPPORTED_COMPILER_IN_CUDA is set (#504) -ifneq ($(origin ALLOW_UNSUPPORTED_COMPILER_IN_CUDA),undefined) -CUFLAGS += -allow-unsupported-compiler endif +# Export GPUCC (so that it can also be used in cudacpp_src.mk?) 
+export GPUCC +export GPUFLAGS + #------------------------------------------------------------------------------- -#=== Configure ccache for C++ and CUDA builds +#=== Configure ccache for C++ and CUDA/HIP builds # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -201,15 +260,15 @@ endif #ifeq ($(USECCACHE)$(shell echo $(AR) | grep ccache),1) # override AR:=ccache $(AR) #endif -ifneq ($(NVCC),) - ifeq ($(USECCACHE)$(shell echo $(NVCC) | grep ccache),1) - override NVCC:=ccache $(NVCC) +ifneq ($(GPUCC),) + ifeq ($(USECCACHE)$(shell echo $(GPUCC) | grep ccache),1) + override GPUCC:=ccache $(GPUCC) endif endif #------------------------------------------------------------------------------- -#=== Configure PowerPC-specific compiler flags for C++ and CUDA +#=== Configure PowerPC-specific compiler flags for C++ and CUDA/HIP # PowerPC-specific CXX compiler flags (being reviewed) ifeq ($(UNAME_P),ppc64le) @@ -225,9 +284,9 @@ else ######CXXFLAGS+= -fno-semantic-interposition # no benefit (neither alone, nor combined with -flto) endif -# PowerPC-specific CUDA compiler flags (to be reviewed!) +# PowerPC-specific CUDA/HIP compiler flags (to be reviewed!) ifeq ($(UNAME_P),ppc64le) - CUFLAGS+= -Xcompiler -mno-float128 + GPUFLAGS+= -Xcompiler -mno-float128 endif #------------------------------------------------------------------------------- @@ -235,9 +294,11 @@ endif #=== Configure defaults and check if user-defined choices exist for OMPFLAGS, AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the default OMPFLAGS choice -ifneq ($(shell $(CXX) --version | egrep '^Intel'),) +ifneq ($(findstring hipcc,$(GPUCC)),) +override OMPFLAGS = # disable OpenMP MT when using hipcc #802 +else ifneq ($(shell $(CXX) --version | egrep '^Intel'),) override OMPFLAGS = -fopenmp -###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without nvcc but not ok with nvcc before #578) +###override OMPFLAGS = # disable OpenMP MT on Intel (was ok without GPUCC but not ok with GPUCC before #578) else ifneq ($(shell $(CXX) --version | egrep '^(clang)'),) override OMPFLAGS = -fopenmp ###override OMPFLAGS = # disable OpenMP MT on clang (was not ok without or with nvcc before #578) @@ -293,7 +354,10 @@ endif # Set the default RNDGEN (random number generator) choice ifeq ($(RNDGEN),) - ifeq ($(NVCC),) + ifeq ($(GPUCC),) + override RNDGEN = hasNoCurand + # Edgecase for HIP compilation + else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) override RNDGEN = hasNoCurand else ifeq ($(RNDGEN),) override RNDGEN = hasCurand @@ -310,7 +374,7 @@ export OMPFLAGS #------------------------------------------------------------------------------- -#=== Set the CUDA/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN +#=== Set the CUDA/HIP/C++ compiler flags appropriate to user-defined choices of AVX, FPTYPE, HELINL, HRDCOD, RNDGEN # Set the build flags appropriate to OMPFLAGS $(info OMPFLAGS=$(OMPFLAGS)) @@ -368,13 +432,13 @@ CXXFLAGS+= $(AVXFLAGS) $(info FPTYPE=$(FPTYPE)) ifeq ($(FPTYPE),d) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE - CUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_DOUBLE else ifeq ($(FPTYPE),f) CXXFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_FLOAT -DMGONGPU_FPTYPE2_FLOAT else ifeq ($(FPTYPE),m) CXXFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT - CUFLAGS += 
-DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT + GPUFLAGS += -DMGONGPU_FPTYPE_DOUBLE -DMGONGPU_FPTYPE2_FLOAT else $(error Unknown FPTYPE='$(FPTYPE)': only 'd', 'f' and 'm' are supported) endif @@ -383,7 +447,7 @@ endif $(info HELINL=$(HELINL)) ifeq ($(HELINL),1) CXXFLAGS += -DMGONGPU_INLINE_HELAMPS - CUFLAGS += -DMGONGPU_INLINE_HELAMPS + GPUFLAGS += -DMGONGPU_INLINE_HELAMPS else ifneq ($(HELINL),0) $(error Unknown HELINL='$(HELINL)': only '0' and '1' are supported) endif @@ -392,7 +456,7 @@ endif $(info HRDCOD=$(HRDCOD)) ifeq ($(HRDCOD),1) CXXFLAGS += -DMGONGPU_HARDCODE_PARAM - CUFLAGS += -DMGONGPU_HARDCODE_PARAM + GPUFLAGS += -DMGONGPU_HARDCODE_PARAM else ifneq ($(HRDCOD),0) $(error Unknown HRDCOD='$(HRDCOD)': only '0' and '1' are supported) endif @@ -444,11 +508,11 @@ ifeq ($(UNAME_S),Darwin) override CULIBFLAGSRPATH2 = else # RPATH to cuda/cpp libs when linking executables - override CXXLIBFLAGSRPATH = -Wl,-rpath,$(LIBDIRRPATH) - override CULIBFLAGSRPATH = -Xlinker -rpath,$(LIBDIRRPATH) + override CXXLIBFLAGSRPATH = -Wl,-rpath=$(LIBDIRRPATH) + override CULIBFLAGSRPATH = -Xlinker -rpath=$(LIBDIRRPATH) # RPATH to common lib when linking cuda/cpp libs - override CXXLIBFLAGSRPATH2 = -Wl,-rpath,'$$ORIGIN' - override CULIBFLAGSRPATH2 = -Xlinker -rpath,'$$ORIGIN' + override CXXLIBFLAGSRPATH2 = -Wl,-rpath='$$ORIGIN' + override CULIBFLAGSRPATH2 = -Xlinker -rpath='$$ORIGIN' endif # Setting LD_LIBRARY_PATH or DYLD_LIBRARY_PATH in the RUNTIME is no longer necessary (neither on Linux nor on Mac) @@ -461,7 +525,7 @@ override RUNTIME = cxx_main=$(BUILDDIR)/check.exe fcxx_main=$(BUILDDIR)/fcheck.exe -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_main=$(BUILDDIR)/gcheck.exe fcu_main=$(BUILDDIR)/fgcheck.exe else @@ -492,14 +556,15 @@ $(BUILDDIR)/.build.$(TAG): @touch $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation -ifneq ($(NVCC),) +# NB: CCBUILDRULEFLAGS includes "-x cu" for nvcc and "-x hip" for hipcc (#810) +ifneq ($(GPUCC),) $(BUILDDIR)/%.o : %.cu *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CUBUILDRULEFLAGS) $< -o $@ $(BUILDDIR)/%_cu.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) @if [ ! 
-d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $(CCBUILDRULEFLAGS) $< -o $@ endif # Generic target and build rules: objects from C++ compilation @@ -509,11 +574,14 @@ $(BUILDDIR)/%.o : %.cc *.h ../../src/*.h $(BUILDDIR)/.build.$(TAG) $(CXX) $(CPPFLAGS) $(CXXFLAGS) -fPIC -c $< -o $@ # Apply special build flags only to CrossSectionKernel.cc and gCrossSectionKernel.cu (no fast math, see #117 and #516) +# Added edgecase for HIP compilation ifeq ($(shell $(CXX) --version | grep ^nvc++),) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS := $(filter-out -ffast-math,$(CXXFLAGS)) $(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -fno-fast-math -ifneq ($(NVCC),) -$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -fno-fast-math +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -fno-fast-math +else + $(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -fno-fast-math endif endif @@ -530,10 +598,10 @@ ifeq ($(RNDGEN),hasCurand) $(BUILDDIR)/CurandRandomNumberKernel.o: CXXFLAGS += $(CUINC) endif -# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in nvcc with icx2023 (#592) +# Avoid "warning: builtin __has_trivial_... is deprecated; use __is_trivially_... instead" in GPUCC with icx2023 (#592) ifneq ($(shell $(CXX) --version | egrep '^(Intel)'),) -ifneq ($(NVCC),) -CUFLAGS += -Xcompiler -Wno-deprecated-builtins +ifneq ($(GPUCC),) +GPUFLAGS += -Wno-deprecated-builtins endif endif @@ -541,8 +609,8 @@ endif # This patch does remove the warning, but I prefer to keep it disabled for the moment... ###ifneq ($(shell $(CXX) --version | egrep '^(clang|Apple clang|Intel)'),) ###$(BUILDDIR)/CrossSectionKernels.o: CXXFLAGS += -Wno-overriding-t-option -###ifneq ($(NVCC),) -###$(BUILDDIR)/gCrossSectionKernels.o: CUFLAGS += -Xcompiler -Wno-overriding-t-option +###ifneq ($(GPUCC),) +###$(BUILDDIR)/gCrossSectionKernels.o: GPUFLAGS += -Xcompiler -Wno-overriding-t-option ###endif ###endif @@ -569,7 +637,7 @@ MG5AMC_CXXLIB = mg5amc_$(processid_short)_cpp cxx_objects_lib=$(BUILDDIR)/CPPProcess.o $(BUILDDIR)/MatrixElementKernels.o $(BUILDDIR)/BridgeKernels.o $(BUILDDIR)/CrossSectionKernels.o cxx_objects_exe=$(BUILDDIR)/CommonRandomNumberKernel.o $(BUILDDIR)/RamboSamplingKernels.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) MG5AMC_CULIB = mg5amc_$(processid_short)_cuda cu_objects_lib=$(BUILDDIR)/gCPPProcess.o $(BUILDDIR)/gMatrixElementKernels.o $(BUILDDIR)/gBridgeKernels.o $(BUILDDIR)/gCrossSectionKernels.o cu_objects_exe=$(BUILDDIR)/gCommonRandomNumberKernel.o $(BUILDDIR)/gRamboSamplingKernels.o @@ -581,11 +649,17 @@ $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: cxx_objects_lib += $(BUILDDIR)/fbridge.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(CXX) -shared -o $@ $(cxx_objects_lib) $(CXXLIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: cu_objects_lib += $(BUILDDIR)/fbridge_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so: $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cu_objects_lib) - $(NVCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) + $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring 
hipcc,$(GPUCC)),) +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) -lstdc++fs +#else +# $(GPUCC) --shared -o $@ $(cu_objects_lib) $(CULIBFLAGSRPATH2) -L$(LIBDIR) -l$(MG5AMC_COMMONLIB) +#endif endif #------------------------------------------------------------------------------- @@ -602,16 +676,16 @@ $(cxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PAT $(cxx_main): $(BUILDDIR)/check_sa.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CXX) -o $@ $(BUILDDIR)/check_sa.o $(OMPFLAGS) -ldl -pthread $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) $(BUILDDIR)/CurandRandomNumberKernel.o $(CURANDLIBFLAGS) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(cu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(cu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(cu_main): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif $(cu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(cu_main): $(BUILDDIR)/gcheck_sa.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o - $(NVCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) + $(GPUCC) -o $@ $(BUILDDIR)/gcheck_sa.o $(CUARCHFLAGS) $(LIBFLAGS) -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) $(BUILDDIR)/gCurandRandomNumberKernel.o $(CURANDLIBFLAGS) endif #------------------------------------------------------------------------------- @@ -635,19 +709,27 @@ $(fcxx_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libg endif $(fcxx_main): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcxx_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler.o $(LIBDIR)/lib$(MG5AMC_CXXLIB).so $(cxx_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) -lstdc++ +else $(CXX) -o $@ $(BUILDDIR)/fcheck_sa.o $(OMPFLAGS) $(BUILDDIR)/fsampler.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CXXLIB) $(cxx_objects_exe) +endif -ifneq ($(NVCC),) +ifneq ($(GPUCC),) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(fcu_main): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(fcu_main): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') endif ifeq ($(UNAME_S),Darwin) $(fcu_main): LIBFLAGS += -L$(shell dirname $(shell $(FC) --print-file-name libgfortran.dylib)) # add path to libgfortran on Mac #375 
endif $(fcu_main): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(fcu_main): $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBDIR)/lib$(MG5AMC_CULIB).so $(cu_objects_exe) - $(NVCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) -lstdc++ -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(BUILDDIR)/fcheck_sa.o $(BUILDDIR)/fsampler_cu.o $(LIBFLAGS) -lgfortran -L$(LIBDIR) -l$(MG5AMC_CULIB) $(cu_objects_exe) +endif endif #------------------------------------------------------------------------------- @@ -659,7 +741,7 @@ $(BUILDDIR)/testxxx.o: testxxx_cc_ref.txt $(testmain): $(BUILDDIR)/testxxx.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testxxx.o # Comment out this line to skip the C++ test of xxx functions -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testxxx_cu.o: $(GTESTLIBS) $(BUILDDIR)/testxxx_cu.o: INCFLAGS += $(GTESTINC) $(BUILDDIR)/testxxx_cu.o: testxxx_cc_ref.txt @@ -672,7 +754,7 @@ $(BUILDDIR)/testmisc.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc.o $(testmain): cxx_objects_exe += $(BUILDDIR)/testmisc.o # Comment out this line to skip the C++ miscellaneous tests -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/testmisc_cu.o: $(GTESTLIBS) $(BUILDDIR)/testmisc_cu.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/testmisc_cu.o @@ -684,12 +766,12 @@ $(BUILDDIR)/runTest.o: INCFLAGS += $(GTESTINC) $(testmain): $(BUILDDIR)/runTest.o $(testmain): cxx_objects_exe += $(BUILDDIR)/runTest.o -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(BUILDDIR)/runTest_cu.o: $(GTESTLIBS) $(BUILDDIR)/runTest_cu.o: INCFLAGS += $(GTESTINC) ifneq ($(shell $(CXX) --version | grep ^Intel),) -$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with nvcc (undefined reference to `_intel_fast_memcpy') -$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with nvcc (undefined reference to `__svml_cos4_l9') +$(testmain): LIBFLAGS += -lintlc # compile with icpx and link with GPUCC (undefined reference to `_intel_fast_memcpy') +$(testmain): LIBFLAGS += -lsvml # compile with icpx and link with GPUCC (undefined reference to `__svml_cos4_l9') else ifneq ($(shell $(CXX) --version | grep ^nvc++),) # support nvc++ #531 $(testmain): LIBFLAGS += -L$(patsubst %bin/nvc++,%lib,$(subst ccache ,,$(CXX))) -lnvhpcatm -lnvcpumath -lnvc endif @@ -713,14 +795,23 @@ $(testmain): LIBFLAGS += -lgomp endif endif -ifeq ($(NVCC),) # link only runTest.o +# Bypass std::filesystem completely to ease portability on LUMI #803 +#ifneq ($(findstring hipcc,$(GPUCC)),) +#$(testmain): LIBFLAGS += -lstdc++fs +#endif + +ifeq ($(GPUCC),) # link only runTest.o $(testmain): LIBFLAGS += $(CXXLIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(GTESTLIBS) $(CXX) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) -ldl -pthread $(LIBFLAGS) else # link both runTest.o and runTest_cu.o $(testmain): LIBFLAGS += $(CULIBFLAGSRPATH) # avoid the need for LD_LIBRARY_PATH $(testmain): $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) $(GTESTLIBS) - $(NVCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) 
$(cu_objects_exe) -ldl $(LIBFLAGS) -lcuda +ifneq ($(findstring hipcc,$(GPUCC)),) # link fortran/c++/hip using $FC when hipcc is used #802 + $(FC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) -lstdc++ -lpthread -L$(shell dirname $(shell $(GPUCC) -print-prog-name=clang))/../../lib -lamdhip64 +else + $(GPUCC) -o $@ $(cxx_objects_lib) $(cxx_objects_exe) $(cu_objects_lib) $(cu_objects_exe) -ldl $(LIBFLAGS) $(CUDATESTFLAGS) +endif endif # Use target gtestlibs to build only googletest @@ -829,9 +920,9 @@ ifeq ($(USECCACHE),1) ccache --version | head -1 endif @echo "" - @echo NVCC=$(NVCC) -ifneq ($(NVCC),) - $(NVCC) --version + @echo GPUCC=$(GPUCC) +ifneq ($(GPUCC),) + $(GPUCC) --version endif @echo "" @echo CXX=$(CXX) @@ -850,7 +941,7 @@ endif # Target: check (run the C++ test executable) # [NB THIS IS WHAT IS USED IN THE GITHUB CI!] -ifneq ($(NVCC),) +ifneq ($(GPUCC),) check: runTest cmpFcheck cmpFGcheck else check: runTest cmpFcheck diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc index 2d2b36d560..22ce3f5115 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fbridge.cc @@ -1,11 +1,11 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Roiser (Oct 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Roiser, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Roiser, J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. #include "Bridge.h" #include "CPPProcess.h" -#include "CudaRuntime.h" +#include "GpuRuntime.h" extern "C" { @@ -22,7 +22,7 @@ extern "C" * Using the same Fortran MadEvent code, linking to the hetrerogeneous library would allow access to both CPU and GPU implementations. * The specific heterogeneous configuration (how many GPUs, how many threads on each CPU, etc) could be loaded in CUDA/C++ from a data file. 
*/ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -46,8 +46,8 @@ extern "C" */ void fbridgecreate_( CppObjectInFortran** ppbridge, const int* pnevtF, const int* pnparF, const int* pnp4F ) { -#ifdef __CUDACC__ - CudaRuntime::setUp(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::setUp(); #endif // (NB: CPPProcess::initProc no longer needs to be executed here because it is called in the Bridge constructor) // FIXME: disable OMP in Bridge when called from Fortran @@ -65,8 +65,8 @@ extern "C" Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgedelete_: invalid Bridge address" ); delete pbridge; -#ifdef __CUDACC__ - CudaRuntime::tearDown(); +#ifdef MGONGPUCPP_GPUIMPL + GpuRuntime::tearDown(); #endif } @@ -96,7 +96,7 @@ extern "C" { Bridge* pbridge = dynamic_cast*>( *ppbridge ); if( pbridge == 0 ) throw std::runtime_error( "fbridgesequence_: invalid Bridge address" ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // Use the device/GPU implementation in the CUDA library // (there is also a host implementation in this library) pbridge->gpu_sequence( momenta, gs, rndhel, rndcol, *pchannelId, mes, selhel, selcol ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc index 2fb445372d..3743934f41 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/fsampler.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Feb 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #include "mgOnGpuConfig.h" @@ -13,7 +13,7 @@ //-------------------------------------------------------------------------- -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -40,7 +40,7 @@ namespace mg5amcCpu private: const int m_nevt; // The number of events in each iteration int m_iiter; // The iteration counter (for random number seeding) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL HostBufferRndNumMomenta m_hstRndmom; // Memory buffers for random numbers HostBufferMomenta m_hstMomenta; // Memory buffers for momenta HostBufferWeights m_hstWeights; // Memory buffers for sampling weights @@ -105,7 +105,7 @@ namespace mg5amcCpu extern "C" { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc index d4a760a71b..de327f2321 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/runTest.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: S. Hageboeck (Nov 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
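In fbridge.cc above, the Fortran bridge now brackets the Bridge lifetime with GpuRuntime::setUp() and GpuRuntime::tearDown() instead of the CUDA-only CudaRuntime calls. A typical purpose of such a pair is to create the GPU context eagerly before the first kernel launch and to release the device cleanly when the bridge is deleted; a rough sketch of what it can do, with hypothetical implementation details that need not match the plugin's GpuRuntime.h:

// Hypothetical sketch of a GPU runtime setUp/tearDown pair (illustration only).
#include <cuda_runtime.h>

struct GpuRuntimeSketch
{
  static void setUp() { cudaFree( 0 ); }        // no-op call commonly used to force eager context creation
  static void tearDown() { cudaDeviceReset(); } // release the device (also helps leak-checking tools)
};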
//---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -18,7 +18,7 @@ #include "RandomNumberKernels.h" #include "epoch_process_id.h" -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -35,7 +35,7 @@ struct CUDA_CPU_TestBase : public TestDriverBase : TestDriverBase( npar, refFileName ) {} }; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL struct CPUTest : public CUDA_CPU_TestBase { // Struct data members (process, and memory structures for random numbers, momenta, matrix elements and weights on host and device) @@ -119,7 +119,7 @@ struct CPUTest : public CUDA_CPU_TestBase }; #endif -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL struct CUDATest : public CUDA_CPU_TestBase { // Reset the device when our test goes out of scope. Note that this should happen after @@ -128,7 +128,7 @@ struct CUDATest : public CUDA_CPU_TestBase { ~DeviceReset() { - checkCuda( cudaDeviceReset() ); // this is needed by cuda-memcheck --leak-check full + checkGpu( gpuDeviceReset() ); // this is needed by cuda-memcheck --leak-check full } } deviceResetter; @@ -256,7 +256,7 @@ INSTANTIATE_TEST_SUITE_P( prefix, \ test_suite_name, \ testing::Values( new CUDATest( MG_EPOCH_REFERENCE_FILE_NAME ) ) ); -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL MG_INSTANTIATE_TEST_SUITE_GPU( XTESTID_GPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); #else MG_INSTANTIATE_TEST_SUITE_CPU( XTESTID_CPU( MG_EPOCH_PROCESS_ID ), MadgraphTest ); diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc index 895d6eeb56..ba9e59a8a3 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testmisc.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*misc to run only testmisc.cc tests //---------------------------------------------------------------------------- @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_MISC #else #define TESTID( s ) s##_CPU_MISC @@ -26,7 +26,7 @@ #define XTESTID( s ) TESTID( s ) // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -59,7 +59,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testmisc ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc index 3361fe5aa9..e5167de00c 100644 --- a/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc +++ b/epochX/cudacpp/pp_tt012j.mad/SubProcesses/testxxx.cc @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. 
// Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Apr 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //---------------------------------------------------------------------------- // Use ./runTest.exe --gtest_filter=*xxx to run only testxxx.cc tests //---------------------------------------------------------------------------- @@ -24,7 +24,7 @@ #include #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #define TESTID( s ) s##_GPU_XXX #else #define TESTID( s ) s##_CPU_XXX @@ -32,7 +32,7 @@ #define XTESTID( s ) TESTID( s ) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -42,7 +42,7 @@ namespace mg5amcCpu int FPEhandlerIevt = -1; inline void FPEhandler( int sig ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL std::cerr << "Floating Point Exception (GPU): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; #else std::cerr << "Floating Point Exception (CPU neppV=" << neppV << "): '" << FPEhandlerMessage << "' ievt=" << FPEhandlerIevt << std::endl; @@ -53,7 +53,7 @@ namespace mg5amcCpu TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; @@ -77,7 +77,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) assert( nevt % neppM == 0 ); // nevt must be a multiple of neppM assert( nevt % neppV == 0 ); // nevt must be a multiple of neppV // Fill in the input momenta -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL mg5amcGpu::PinnedHostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] #else mg5amcCpu::HostBufferMomenta hstMomenta( nevt ); // AOSOA[npagM][npar=4][np4=4][neppM] @@ -322,7 +322,7 @@ TEST( XTESTID( MG_EPOCH_PROCESS_ID ), testxxx ) { for( int ievt = 0; ievt < nevt; ievt++ ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py index 3b09713e12..90af6b7053 100644 --- a/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py +++ b/epochX/cudacpp/pp_tt012j.mad/bin/internal/launch_plugin.py @@ -103,7 +103,7 @@ def default_setup(self): fct_mod=(self.reset_makeopts,(),{}), allowed=['auto', 'none', 'sse4', 'avx2','512y','512z']) self.add_param('cudacpp_backend', 'CPP', include=False, hidden=False, - allowed=['Fortan', 'CPP', 'CUDA']) + allowed=['Fortran', 'CPP', 'CUDA']) self['vector_size'] = 16 # already setup in default class (just change value) self['aloha_flag'] = '--fast-math' self['matrix_flag'] = '-O3' diff --git a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h index 8df465ad6d..8b4ad719be 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/HelAmps_sm.h @@ -5,7 +5,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. 
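The runTest.cc hunk above wraps gpuDeviceReset() in checkGpu(), the abstraction-layer counterpart of the checkCuda() helper it replaces, so that a failed runtime call aborts the test with a readable message on both CUDA and HIP. A minimal sketch of such an error-checking helper, assuming a CUDA backend and hypothetical names:

// Hypothetical sketch of a checkGpu-style helper (the plugin's real macro may differ).
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

inline void checkGpuSketch( cudaError_t code, const char* file, int line )
{
  if( code != cudaSuccess )
  {
    std::fprintf( stderr, "GPU error '%s' at %s:%d\n", cudaGetErrorString( code ), file, line );
    std::exit( EXIT_FAILURE );
  }
}
#define CHECK_GPU_SKETCH( call ) checkGpuSketch( ( call ), __FILE__, __LINE__ )

// Usage: CHECK_GPU_SKETCH( cudaDeviceReset() );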
//========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -28,7 +28,7 @@ //#include //#include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc index 64fc3fea62..067445b198 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.cc @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: A. Valassi (Sep 2021) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2021-2023) for the MG5aMC CUDACPP plugin. //========================================================================== // This file has been automatically generated for CUDA/C++ standalone by // MadGraph5_aMC@NLO v. 3.5.2_lo_vect, 2023-11-08 @@ -17,7 +17,7 @@ #include #include -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL using namespace mg5amcGpu; #else using namespace mg5amcCpu; diff --git a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h index b6568d3761..9581d66e0e 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/Parameters_sm.h @@ -27,7 +27,7 @@ #include "read_slha.h" // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -93,7 +93,7 @@ namespace mg5amcCpu #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -218,7 +218,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -239,7 +239,7 @@ namespace mg5amcCpu #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-variable" // e.g. <> #pragma GCC diagnostic ignored "-Wunused-parameter" // e.g. <> -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma nv_diagnostic push #pragma nv_diag_suppress 177 // e.g. <> #endif @@ -267,7 +267,7 @@ namespace mg5amcCpu // End SM implementation - no special handling of vectors of floats as in EFT (#439) return out; } -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL #pragma GCC diagnostic pop #pragma nv_diagnostic pop #endif diff --git a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk index d4cc628aec..b2b9da5288 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk +++ b/epochX/cudacpp/pp_tt012j.mad/src/cudacpp_src.mk @@ -19,7 +19,7 @@ SHELL := /bin/bash #=== Configure common compiler flags for CUDA and C++ INCFLAGS = -I. 
-OPTFLAGS = -O3 # this ends up in CUFLAGS too (should it?), cannot add -Ofast or -ffast-math here +OPTFLAGS = -O3 # this ends up in GPUFLAGS too (should it?), cannot add -Ofast or -ffast-math here #------------------------------------------------------------------------------- @@ -45,13 +45,13 @@ endif #------------------------------------------------------------------------------- -#=== Configure the CUDA compiler (note: NVCC is already exported including ccache) +#=== Configure the CUDA compiler (note: GPUCC is already exported including ccache) -###$(info NVCC=$(NVCC)) +###$(info GPUCC=$(GPUCC)) #------------------------------------------------------------------------------- -#=== Configure ccache for C++ builds (note: NVCC is already exported including ccache) +#=== Configure ccache for C++ builds (note: GPUCC is already exported including ccache) # Enable ccache if USECCACHE=1 ifeq ($(USECCACHE)$(shell echo $(CXX) | grep ccache),1) @@ -92,6 +92,13 @@ endif ###$(info OMPFLAGS=$(OMPFLAGS)) CXXFLAGS += $(OMPFLAGS) +# Add correct flags for nvcc (-x cu) and hipcc (-x hip) for GPU code (see #810) +ifeq ($(findstring nvcc,$(GPUCC)),nvcc) + GPUFLAGS += -Xcompiler -fPIC -c -x cu +else ifeq ($(findstring hipcc,$(GPUCC)),hipcc) + GPUFLAGS += -fPIC -c -x hip +endif + # Set the build flags appropriate to each AVX choice (example: "make AVX=none") # [NB MGONGPU_PVW512 is needed because "-mprefer-vector-width=256" is not exposed in a macro] # [See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96476] @@ -253,20 +260,20 @@ $(BUILDDIR)/%.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) # Generic target and build rules: objects from CUDA compilation $(BUILDDIR)/%_cu.o : %.cc *.h $(BUILDDIR)/.build.$(TAG) @if [ ! -d $(BUILDDIR) ]; then echo "mkdir -p $(BUILDDIR)"; mkdir -p $(BUILDDIR); fi - $(NVCC) $(CPPFLAGS) $(CUFLAGS) -Xcompiler -fPIC -c -x cu $< -o $@ + $(GPUCC) $(CPPFLAGS) $(GPUFLAGS) $< -o $@ #------------------------------------------------------------------------------- cxx_objects=$(addprefix $(BUILDDIR)/, Parameters_sm.o read_slha.o) -ifneq ($(NVCC),) +ifneq ($(GPUCC),) cu_objects=$(addprefix $(BUILDDIR)/, Parameters_sm_cu.o) endif # Target (and build rules): common (src) library -ifneq ($(NVCC),) +ifneq ($(GPUCC),) $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) $(cu_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi - $(NVCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) + $(GPUCC) -shared -o $@ $(cxx_objects) $(cu_objects) $(LDFLAGS) else $(LIBDIR)/lib$(MG5AMC_COMMONLIB).so : $(cxx_objects) @if [ ! -d $(LIBDIR) ]; then echo "mkdir -p $(LIBDIR)"; mkdir -p $(LIBDIR); fi diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h index 80032e528b..6bde4466d0 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuConfig.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jul 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. 
#ifndef MGONGPUCONFIG_H #define MGONGPUCONFIG_H 1 @@ -10,12 +10,26 @@ // There are two different code bases for standalone_cudacpp (without multichannel) and madevent+cudacpp (with multichannel) #define MGONGPU_SUPPORTS_MULTICHANNEL 1 +// Is this a GPU (CUDA, HIP) or CPU implementation? +#ifdef __CUDACC__ +#define MGONGPUCPP_GPUIMPL cuda +#elif defined __HIPCC__ +#define MGONGPUCPP_GPUIMPL hip +#include "hip/hip_runtime.h" // needed for blockDim, blockIdx, threadIdx: better in mgOnGpuConfig.h than in GpuAbstraction.h +#else +#undef MGONGPUCPP_GPUIMPL +#endif + // ** NB1 Throughputs (e.g. 6.8E8) are events/sec for "./gcheck.exe -p 65536 128 12" // ** NB2 Baseline on b7g47n0004 fluctuates (probably depends on load on other VMs) // Choose if curand is supported for generating random numbers +// For HIP, by default, do not use curand (common random numbers will be used instead) // For both CUDA and C++, by default, do not inline, but allow this macro to be set from outside with e.g. -DMGONGPU_HAS_NO_CURAND -// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784) +// (there exist CUDA installations, e.g. using the HPC package, which do not include curand - see PR #784 and #785) +#if defined __HIPCC__ +#define MGONGPU_HAS_NO_CURAND 1 +#else //#ifdef __CUDACC__ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 @@ -23,6 +37,7 @@ //#undef MGONGPU_HAS_NO_CURAND // default ////#define MGONGPU_HAS_NO_CURAND 1 //#endif +#endif // Choose floating point precision (for everything but color algebra #537) // If one of these macros has been set from outside with e.g. -DMGONGPU_FPTYPE_FLOAT, nothing happens (issue #167) @@ -54,23 +69,31 @@ //#undef MGONGPU_HARDCODE_PARAM // default ////#define MGONGPU_HARDCODE_PARAM 1 -// Complex type in c++: std::complex or cxsmpl (CHOOSE ONLY ONE) -#ifndef __CUDACC__ -//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) -#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) -#endif - -// Complex type in cuda: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// Complex type in CUDA: thrust or cucomplex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CU*CXTYPE_xxx) #ifdef __CUDACC__ #define MGONGPU_CUCXTYPE_THRUST 1 // default (~1.15E9/double, ~3.2E9/float) //#define MGONGPU_CUCXTYPE_CUCOMPLEX 1 // ~10 percent slower (1.03E9/double, ~2.8E9/float) //#define MGONGPU_CUCXTYPE_CXSMPL 1 // ~10 percent slower (1.00E9/double, ~2.9E9/float) + +// Complex type in HIP: cxsmpl (ONLY ONE OPTION POSSIBLE? 
#810) +// (NB THIS IS MGONGPU_*HIP*CXTYPE_xxx) +#elif defined __HIPCC__ +#define MGONGPU_HIPCXTYPE_CXSMPL 1 // default for HIP + +// Complex type in C++: std::complex or cxsmpl (CHOOSE ONLY ONE) +// (NB THIS IS MGONGPU_*CPP*CXTYPE_xxx) +#else +//#define MGONGPU_CPPCXTYPE_STDCOMPLEX 1 // ~8 percent slower on float, same on double (5.1E6/double, 9.4E6/float) +#define MGONGPU_CPPCXTYPE_CXSMPL 1 // new default (5.1E6/double, 10.2E6/float) #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation #ifdef __CUDACC__ -#undef MGONGPU_NSIGHT_DEBUG // default +#undef MGONGPU_NSIGHT_DEBUG // default in CUDA //#define MGONGPU_NSIGHT_DEBUG 1 +#else +#undef MGONGPU_NSIGHT_DEBUG // only option in HIP or C++ #endif // SANITY CHECKS (floating point precision for everything but color algebra #537) @@ -86,17 +109,21 @@ #error You cannot use double precision for color algebra and single precision elsewhere #endif -// SANITY CHECKS (c++ complex number implementation) -#ifndef __CUDACC__ -#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL +// SANITY CHECKS (CUDA complex number implementation) +#ifdef __CUDACC__ +#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX for CUDA +#elif defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CXSMPL for CUDA +#elif defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE OF MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL for CUDA #endif #endif -// SANITY CHECKS (cuda complex number implementation) -#ifdef __CUDACC__ -#if defined MGONGPU_CUCXTYPE_THRUST and defined MGONGPU_CUCXTYPE_CUCOMPLEX and defined MGONGPU_CUCXTYPE_CXSMPL -#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL +// SANITY CHECKS (C++ complex number implementation) +#ifndef MGONGPUCPP_GPUIMPL +#if defined MGONGPU_CPPCXTYPE_STDCOMPLEX and defined MGONGPU_CPPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CPPCXTYPE_STDCOMPLEX or MGONGPU_CPPCXTYPE_CXSMPL for C++ #endif #endif @@ -134,7 +161,7 @@ namespace mgOnGpu // Alignment requirement for using reinterpret_cast with SIMD vectorized code // (using reinterpret_cast with non aligned memory may lead to segmentation faults!) // Only needed for C++ code but can be enforced also in NVCC builds of C++ code using CUDA>=11.2 and C++17 (#318, #319, #333) -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL constexpr int cppAlign = 64; // alignment requirement for SIMD vectorization (64-byte i.e. 
512-bit) #endif @@ -145,7 +172,7 @@ using mgOnGpu::fptype; using mgOnGpu::fptype2; // C++ SIMD vectorization width (this will be used to set neppV) -#ifdef __CUDACC__ // CUDA implementation has no SIMD +#ifdef MGONGPUCPP_GPUIMPL // CUDA and HIP implementations have no SIMD #undef MGONGPU_CPPSIMD #elif defined __AVX512VL__ && defined MGONGPU_PVW512 // C++ "512z" AVX512 with 512 width (512-bit ie 64-byte): 8 (DOUBLE) or 16 (FLOAT) #ifdef MGONGPU_FPTYPE_DOUBLE @@ -175,9 +202,9 @@ using mgOnGpu::fptype2; #undef MGONGPU_CPPSIMD #endif -// Cuda nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation +// CUDA nsight compute (ncu) debug: add dummy lines to ease SASS program flow navigation // Arguments (not used so far): text is __FUNCTION__, code is 0 (start) or 1 (end) -#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ +#if defined __CUDACC__ && defined MGONGPU_NSIGHT_DEBUG /* clang-format off */ #define mgDebugDeclare() __shared__ float mgDebugCounter[mgOnGpu::ntpbMAX]; #define mgDebugInitialise() { mgDebugCounter[threadIdx.x] = 0; } #define mgDebug( code, text ) { mgDebugCounter[threadIdx.x] += 1; } @@ -189,8 +216,8 @@ using mgOnGpu::fptype2; #define mgDebugFinalise() { /*noop*/ } #endif /* clang-format on */ -// Define empty CUDA declaration specifiers for C++ -#ifndef __CUDACC__ +// Define empty CUDA/HIP declaration specifiers for C++ +#ifndef MGONGPUCPP_GPUIMPL #define __global__ #define __host__ #define __device__ diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h index ca9a9f00c0..7ede1dbfae 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuCxtypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022, based on earlier work by D. Smith) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin.
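In the mgOnGpuCxtypes.h hunks that follow, the new MGONGPUCPP_GPUIMPL macro is used only where CUDA and HIP behave identically (namespace selection), while genuinely CUDA-only branches (thrust::complex, cuComplex) keep __CUDACC__, and cxsmpl becomes the only complex type available on HIP. A minimal sketch of that three-way dispatch behind a single typedef is given below; the names demo_cxtype and demo_cxmake are hypothetical, and std::complex merely stands in for the in-house cxsmpl in the HIP branch to keep the sketch self-contained.

// complex_dispatch_demo.cc -- minimal sketch (hypothetical file, not in the repository)
#include <complex>
#include <iostream>
#ifdef __CUDACC__
#include <thrust/complex.h> // CUDA-only header, guarded by __CUDACC__ (not by the GPU-impl macro)
#endif

#ifdef __CUDACC__
typedef thrust::complex<double> demo_cxtype; // CUDA: thrust::complex (cf. MGONGPU_CUCXTYPE_THRUST)
#elif defined __HIPCC__
typedef std::complex<double> demo_cxtype; // HIP: stand-in for cxsmpl (cf. MGONGPU_HIPCXTYPE_CXSMPL)
#else
typedef std::complex<double> demo_cxtype; // C++: std::complex or cxsmpl (cf. MGONGPU_CPPCXTYPE_*)
#endif

// A single factory function hides the concrete complex type from the rest of the code
inline demo_cxtype demo_cxmake( const double r, const double i ) { return demo_cxtype( r, i ); }

int main()
{
  const demo_cxtype z = demo_cxmake( 1., 2. );
  std::cout << "z = ( " << z.real() << ", " << z.imag() << " )" << std::endl;
  return 0;
}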
#ifndef MGONGPUCXTYPES_H #define MGONGPUCXTYPES_H 1 @@ -19,7 +19,7 @@ #include // Complex type in cuda: thrust or cucomplex or cxsmpl -#ifdef __CUDACC__ +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wtautological-compare" // for icpx2021/clang13 (https://stackoverflow.com/a/15864661) @@ -30,8 +30,13 @@ #elif not defined MGONGPU_CUCXTYPE_CXSMPL #error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_CUCXTYPE_THRUST or MGONGPU_CUCXTYPE_CUCOMPLEX or MGONGPU_CUCXTYPE_CXSMPL #endif +// Complex type in HIP: cxsmpl +#elif defined __HIPCC__ +#if not defined MGONGPU_HIPCXTYPE_CXSMPL +#error You must CHOOSE (ONE AND) ONLY ONE of MGONGPU_HIPCXTYPE_CXSMPL +#endif #else -// Complex type in c++: std::complex or cxsmpl +// Complex type in c++ or HIP: std::complex or cxsmpl #if defined MGONGPU_CPPCXTYPE_STDCOMPLEX #include #elif not defined MGONGPU_CPPCXTYPE_CXSMPL @@ -82,7 +87,7 @@ namespace mgOnGpu /* clang-format off */ using mgOnGpu::cxsmpl; // Printout to stream for user defined types -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -92,7 +97,7 @@ namespace mg5amcCpu inline __host__ std::ostream& operator<<( std::ostream& out, const cxsmpl& c ) { - out << std::complex( c.real(), c.imag() ); + out << std::complex( c.real(), c.imag() ); return out; } @@ -215,14 +220,14 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { // --- Type definitions (complex type: cxtype) -#ifdef __CUDACC__ // cuda +#ifdef __CUDACC__ // this must be __CUDACC__ (not MGONGPUCPP_GPUIMPL) #if defined MGONGPU_CUCXTYPE_THRUST typedef thrust::complex cxtype; #elif defined MGONGPU_CUCXTYPE_CUCOMPLEX @@ -255,13 +260,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //------------------------------ // CUDA or C++ - using cxsmpl @@ -303,11 +308,11 @@ namespace mg5amcCpu return cxmake( c.real(), c.imag() ); } -#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL +#endif // #if defined MGONGPU_CUCXTYPE_CXSMPL or defined MGONGPU_HIPCXTYPE_CXSMPL or defined MGONGPU_CPPCXTYPE_CXSMPL //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_THRUST // cuda + thrust (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using thrust::complex @@ -347,7 +352,7 @@ namespace mg5amcCpu //========================================================================== -#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex +#if defined __CUDACC__ and defined MGONGPU_CUCXTYPE_CUCOMPLEX // cuda + cucomplex
(this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // CUDA - using cuComplex @@ -566,7 +571,7 @@ namespace mg5amcCpu //========================================================================== -#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++ + stdcomplex +#if not defined __CUDACC__ and defined MGONGPU_CPPCXTYPE_STDCOMPLEX // c++/hip + stdcomplex (this must be __CUDACC__ and not MGONGPUCPP_GPUIMPL) //------------------------------ // C++ - using std::complex @@ -633,7 +638,7 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h index 905c97d700..fa3a02664b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuFptypes.h @@ -1,7 +1,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Created by: A. Valassi (Jan 2022) for the MG5aMC CUDACPP plugin. -// Further modified by: A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: J. Teig, A. Valassi (2022-2023) for the MG5aMC CUDACPP plugin. #ifndef MGONGPUFPTYPES_H #define MGONGPUFPTYPES_H 1 @@ -12,7 +12,7 @@ #include // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL // cuda namespace mg5amcGpu #else namespace mg5amcCpu @@ -20,7 +20,7 @@ namespace mg5amcCpu { //========================================================================== -#ifdef __CUDACC__ // cuda +#ifdef MGONGPUCPP_GPUIMPL // cuda //------------------------------ // Floating point types - Cuda @@ -64,11 +64,11 @@ namespace mg5amcCpu #endif } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL //------------------------------ // Floating point types - C++ @@ -92,7 +92,7 @@ namespace mg5amcCpu return std::sqrt( f ); } -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== diff --git a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h index e1299ba81e..cdae04326b 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/mgOnGpuVectors.h @@ -32,7 +32,7 @@ #endif // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds (see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -131,7 +131,7 @@ namespace mg5amcCpu #endif #endif -#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef __CUDACC__) +#else // i.e #ifndef MGONGPU_CPPSIMD (this includes #ifdef MGONGPUCPP_GPUIMPL) const int neppV = 1; @@ -153,13 +153,13 @@ namespace mg5amcCpu //========================================================================== // NB: namespaces mg5amcGpu and mg5amcCpu includes types which are defined in different ways for CPU and GPU builds
(see #318 and #725) -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu #endif { -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // Printout to stream for user defined types @@ -805,11 +805,11 @@ namespace mg5amcCpu #endif // #if defined MGONGPU_CPPSIMD and defined MGONGPU_FPTYPE_DOUBLE and defined MGONGPU_FPTYPE2_FLOAT -#endif // #ifndef __CUDACC__ +#endif // #ifndef MGONGPUCPP_GPUIMPL //========================================================================== -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL //------------------------------ // Vector types - CUDA @@ -853,12 +853,12 @@ namespace mg5amcCpu return mask; } -#endif // #ifdef __CUDACC__ +#endif // #ifdef MGONGPUCPP_GPUIMPL //========================================================================== // Scalar-or-vector types: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL typedef bool bool_sv; typedef fptype fptype_sv; typedef fptype2 fptype2_sv; @@ -879,7 +879,7 @@ namespace mg5amcCpu #endif // Scalar-or-vector zeros: scalar in CUDA, vector or scalar in C++ -#ifdef __CUDACC__ /* clang-format off */ +#ifdef MGONGPUCPP_GPUIMPL /* clang-format off */ inline __host__ __device__ cxtype cxzero_sv(){ return cxtype( 0, 0 ); } #elif defined MGONGPU_CPPSIMD inline cxtype_v cxzero_sv() { return cxtype_v(); } // RRRR=0000 IIII=0000 diff --git a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h index e02ea52496..cd7e1008ea 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/rambo.h +++ b/epochX/cudacpp/pp_tt012j.mad/src/rambo.h @@ -4,7 +4,7 @@ // Copyright (C) 2020-2023 CERN and UCLouvain. // Licensed under the GNU Lesser General Public License (version 3 or later). // Modified by: S. Roiser (Feb 2020) for the MG5aMC CUDACPP plugin. -// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. +// Further modified by: S. Hageboeck, O. Mattelaer, S. Roiser, J. Teig, A. Valassi (2020-2023) for the MG5aMC CUDACPP plugin. //========================================================================== #include "mgOnGpuConfig.h" @@ -18,7 +18,7 @@ #include // Simplified rambo version for 2 to N (with N>=2) processes with massless particles -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL namespace mg5amcGpu #else namespace mg5amcCpu @@ -83,7 +83,7 @@ namespace mg5amcCpu static bool first = true; if( first ) { -#ifdef __CUDACC__ +#ifdef MGONGPUCPP_GPUIMPL if constexpr( M_ACCESS::isOnDevice() ) // avoid { const int ievt0 = 0; @@ -166,7 +166,7 @@ namespace mg5amcCpu wt = po2log; if( nparf != 2 ) wt = ( 2. * nparf - 4. ) * log( energy ) + z[nparf - 1]; -#ifndef __CUDACC__ +#ifndef MGONGPUCPP_GPUIMPL // issue warnings if weight is too small or too large static int iwarn[5] = { 0, 0, 0, 0, 0 }; if( wt < -180. ) diff --git a/epochX/cudacpp/pp_tt012j.mad/src/read_slha.cc b/epochX/cudacpp/pp_tt012j.mad/src/read_slha.cc index 055b19a779..f8e46f2e66 100644 --- a/epochX/cudacpp/pp_tt012j.mad/src/read_slha.cc +++ b/epochX/cudacpp/pp_tt012j.mad/src/read_slha.cc @@ -11,7 +11,11 @@ #include #include -#include +//#ifdef __HIPCC__ +//#include // see https://rocm.docs.amd.com/en/docs-5.4.3/CHANGELOG.html#id79 +//#else +//#include // bypass this completely to ease portability on LUMI #803 +//#endif #include #include @@ -60,7 +64,14 @@ SLHAReader::read_slha_file( std::string file_name, bool verbose ) { std::cout << "WARNING! 
Card file '" << file_name << "' does not exist:" << " look for the file in directory $" << envpath << "='" << getenv( envpath ) << "'" << std::endl; + /* +#ifdef __HIPCC__ + const std::string file_name2 = std::experimental::filesystem::path( getenv( envpath ) ) / std::experimental::filesystem::path( file_name ).filename(); +#else const std::string file_name2 = std::filesystem::path( getenv( envpath ) ) / std::filesystem::path( file_name ).filename(); +#endif + */ + const std::string file_name2 = std::string( getenv( envpath ) ) + "/" + file_name; // bypass std::filesystem #803 param_card.open( file_name2.c_str(), std::ifstream::in ); if( param_card.good() ) { diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 67b7aa5182..459e70d382 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:35:02 +DATE: 2024-01-30_06:09:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6482s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6399s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.6491s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6403s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.31E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1828s - [COUNTERS] Fortran MEs ( 1 ) : 0.0087s for 8192 events => throughput is 9.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1850s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s + [COUNTERS] Fortran MEs ( 1 ) : 0.0089s for 8192 events => throughput is 9.22E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4504s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3577s - [COUNTERS] Fortran MEs ( 1 ) : 0.0927s for 90112 events => throughput is 9.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4417s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3482s + [COUNTERS] Fortran MEs ( 1 ) : 0.0935s for 90112 events => throughput is 9.64E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.2021s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1952s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1896s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1825s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 8192 events => throughput is 1.15E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) +OK! 
xsec from fortran (0.21747169064681776) and cpp (0.21747169064681779) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4477s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0766s for 90112 events => throughput is 1.18E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4377s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3566s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0811s for 90112 events => throughput is 1.11E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.222047e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.115404e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.231835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.135181e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169064681779] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1890s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1847s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.93E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1812s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.96E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169064681776) differ by less than 2E-14 (0.0) +OK! 
xsec from fortran (0.21747169064681776) and cpp (0.21747169064681779) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3996s + [COUNTERS] PROGRAM TOTAL : 0.4018s [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0454s for 90112 events => throughput is 1.98E+06 events/s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0476s for 90112 events => throughput is 1.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.953284e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.873422e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.976393e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.997339e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1867s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1835s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0032s for 8192 events => throughput is 2.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1809s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.62E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3874s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3527s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0347s for 90112 events => throughput is 2.60E+06 events/s + 
[COUNTERS] PROGRAM TOTAL : 0.3915s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3558s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 90112 events => throughput is 2.53E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.554304e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.563581e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.675247e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.701936e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1863s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1833s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.70E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1820s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1791s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.77E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3840s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3511s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0329s for 90112 events => throughput is 2.74E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3886s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3550s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0336s for 90112 events => throughput is 2.68E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.686020e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.733041e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.793515e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.837193e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1887s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1850s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1852s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1819s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.50E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3954s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3543s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0411s for 90112 events => throughput is 2.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3986s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3589s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0397s for 90112 events => throughput is 2.27E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813669E-002) differ by less than 2E-14 (0.0) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.045622e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.277912e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.176762e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.388817e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6011s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6006s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6311s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6306s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.64E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501919904813628E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501919904813656E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7869s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7816s + [COUNTERS] PROGRAM TOTAL : 0.7893s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7841s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 90112 events => throughput is 1.73E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919904813628E-002) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919904813656E-002) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.145579e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.924722e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.960559e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934037e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.712501e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.691813e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.408619e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.449601e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.718171e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.683159e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.071930e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.033957e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.709644e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.708340e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.143512e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.130631e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 79af27bb3b..161c62cc9b 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -2,9 +2,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/e CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:35:20 +DATE: 2024-01-30_06:09:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6532s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6452s - [COUNTERS] Fortran MEs ( 1 ) : 0.0080s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6495s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6408s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.47E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1876s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1795s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1762s + [COUNTERS] Fortran MEs ( 1 ) : 0.0085s for 8192 events => throughput is 9.59E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section 
= 0.0915 [9.1501919904813656E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4462s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3548s - [COUNTERS] Fortran MEs ( 1 ) : 0.0914s for 90112 events => throughput is 9.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3487s + [COUNTERS] Fortran MEs ( 1 ) : 0.0934s for 90112 events => throughput is 9.65E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166087172673] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747165492032638] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1980s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1915s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 8192 events => throughput is 1.26E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1909s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.19E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166087172673) differ by less than 4E-4 (1.369147908381052e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747165492032638) differ by less than 4E-4 (1.6428111293542713e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501907796603360E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501905274264717E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4361s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3625s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0735s for 90112 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4341s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3569s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0773s for 90112 events => throughput is 1.17E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501907796603360E-002) differ by less than 4E-4 (1.3232739060065057e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905274264717E-002) differ by less than 4E-4 (1.5989335488963974e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.253065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.185144e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.252116e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.215006e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165570339780] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1859s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1834s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1786s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.30E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905322826635E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3765s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3483s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0282s for 90112 events => throughput is 3.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3824s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3531s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0293s for 90112 events => throughput is 3.08E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905322826635E-002) differ by less than 4E-4 (1.5936263453308896e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905322826635E-002) differ by less than 4E-4 (1.5936263464411127e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.183638e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.133589e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.347573e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.329458e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1896s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1874s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0022s for 8192 events => throughput is 3.64E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1817s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1793s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.43E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3808s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3552s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 90112 events => throughput is 3.52E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3770s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0262s for 90112 events => throughput is 3.44E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632103443406e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.438480e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.621496e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.639168e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716307e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747165593922979] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1913s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1889s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1824s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.63E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501905316084181E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3804s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3553s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 90112 events => throughput is 3.60E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0248s for 90112 events => throughput is 3.63E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632103443406e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501905316084181E-002) differ by less than 4E-4 (1.5943632114545636e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.503584e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.744053e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.715865e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971296e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747166440400542] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747166446533123] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1893s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1871s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1822s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1798s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.36E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166440400542) differ by less than 4E-4 (1.20672314918302e-07) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747166446533123) differ by less than 4E-4 (1.2039032049049325e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 [9.1501908978565555E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0915 [9.1501908990866423E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3848s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3584s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0264s for 90112 events => throughput is 3.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0263s for 90112 events => throughput is 3.42E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501908978565555E-002) differ by less than 4E-4 (1.194100419654731e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501908990866423E-002) differ by less than 4E-4 (1.1927560927826875e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.231359e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.615840e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.611631e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.903927e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747166823487174] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6022s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6017s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.67E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6073s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6068s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.61E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501910542849674E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7749s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7703s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0046s for 90112 events => throughput is 1.95E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7794s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.89E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501910542849674E-002) differ by less than 4E-4 (1.0231439939722975e-07) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501910542849674E-002) differ by less than 4E-4 (1.0231439961927435e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.534234e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.032746e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.793140e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.810870e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.972505e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.874936e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.054646e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.028452e+09 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.870406e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.891915e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.242224e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.234607e+09 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.358520e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.256002e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.400117e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.441320e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 52cbc87cca..f51b70af46 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,11 +15,11 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:35:37 +DATE: 2024-01-30_06:10:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 3893 events (found 7395 events) - [COUNTERS] PROGRAM TOTAL : 0.6492s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6412s - [COUNTERS] Fortran MEs ( 1 ) : 0.0081s for 8192 events => throughput is 1.02E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6504s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6416s + [COUNTERS] Fortran MEs ( 1 ) : 0.0088s for 8192 events => throughput is 9.35E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169064681776] fbridge_mode=0 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1853s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1770s - [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.1846s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1761s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.55E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0915 
[9.1501919904813656E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.0915 [9.1501919904813669E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4357s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3463s - [COUNTERS] Fortran MEs ( 1 ) : 0.0895s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4424s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3488s + [COUNTERS] Fortran MEs ( 1 ) : 0.0936s for 90112 events => throughput is 9.63E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169074211736] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1959s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0069s for 8192 events => throughput is 1.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1932s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1857s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.09E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211734) differ by less than 2E-4 (4.382159080051906e-10) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211736) differ by less than 2E-4 (4.3821613004979554e-10) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.4287s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 90112 events => throughput is 1.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3600s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0820s for 90112 events => throughput is 1.10E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.181180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.101615e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.190793e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.125271e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2175 [0.21747169074211728] fbridge_mode=1 + [XSECTION] Cross section = 0.2175 [0.21747169074211734] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1904s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1863s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0041s for 8192 events => throughput is 1.99E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1841s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0042s for 8192 events => throughput is 1.94E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211728) differ by less than 2E-4 (4.382156859605857e-10) +OK! xsec from fortran (0.21747169064681776) and cpp (0.21747169074211734) differ by less than 2E-4 (4.382159080051906e-10) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919915927155E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3979s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0443s for 90112 events => throughput is 2.03E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.4024s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3561s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0464s for 90112 events => throughput is 1.94E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919915927155E-002) differ by less than 2E-4 (1.214564004925478e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.044048e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.982524e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.969679e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.072335e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1844s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.46E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3864s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3521s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0343s for 90112 events => throughput is 2.63E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3912s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3545s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0368s for 90112 events => throughput is 2.45E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.2480907680442215e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.597342e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.431608e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.692233e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.633191e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1873s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1842s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1842s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1811s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3865s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3532s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0333s for 90112 events => throughput is 2.71E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0343s for 90112 events => throughput is 2.63E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.2480907680442215e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.726998e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.754832e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.851400e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.841477e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169063975949] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.1875s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1837s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0037s for 8192 events => throughput is 2.19E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1840s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1805s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919908700741E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.3941s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3550s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 90112 events => throughput is 2.31E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3972s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3574s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0398s for 90112 events => throughput is 2.26E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.2480907680442215e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919908700741E-002) differ by less than 2E-4 (4.248068563583729e-11) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.197385e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.333320e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.341038e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.456277e+06 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2175 [0.21747169066587257] fbridge_mode=1 [UNWEIGHT] Wrote 1611 events (found 1616 events) - [COUNTERS] PROGRAM TOTAL : 0.6091s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6086s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.65E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6100s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6095s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.62E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0915 [9.1501919911173610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1803 events (found 1808 events) - [COUNTERS] PROGRAM TOTAL : 0.7703s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7654s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7838s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 90112 events => throughput is 1.74E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.1501919904813656E-002) and cpp (9.1501919911173610E-002) differ by less than 2E-4 (6.95061785904727e-11) +OK! xsec from fortran (9.1501919904813669E-002) and cpp (9.1501919911173610E-002) differ by less than 2E-4 (6.95061785904727e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.964035e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.926990e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.893973e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.883911e+08 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.636276e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.714682e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.478323e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.463238e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.697383e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.709935e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.585042e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.999719e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.701325e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.716961e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.163459e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.154425e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt 
b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 99bc7401b1..6a2d60f404 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -2,38 +2,38 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make USEBUILDDIR=1 AVX=512y +make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2023-11-24_15:35:54 +DATE: 2024-01-30_06:10:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3685s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3268s - [COUNTERS] Fortran MEs ( 1 ) : 0.0417s for 8192 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4078s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3635s + [COUNTERS] Fortran MEs ( 1 ) : 0.0443s for 8192 events => throughput is 1.85E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2769s - [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3357s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2918s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7190s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2578s - [COUNTERS] Fortran MEs ( 1 ) : 0.4612s for 90112 events => throughput is 1.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8736s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3937s + [COUNTERS] Fortran MEs ( 1 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3623s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3233s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0391s for 8192 events => throughput is 2.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3701s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3307s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] 
Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7507s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3257s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4250s for 90112 events => throughput is 2.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8532s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4213s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4319s for 90112 events => throughput is 2.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775372) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.132082e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.120291e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.162141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.118287e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708277600102] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3276s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3055s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3362s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3135s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0226s for 8192 events => throughput is 3.62E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708277600102) differ by less than 2E-14 (3.3306690738754696e-16) +OK! 
xsec from fortran (47.690708277600116) and cpp (47.690708277600109) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782291775379] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5423s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2971s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2451s for 90112 events => throughput is 3.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6510s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4023s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2487s for 90112 events => throughput is 3.62E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775379) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.538888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.657266e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.745669e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3041s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2904s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0136s for 8192 events => throughput is 6.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3190s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3046s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0144s for 8192 events => throughput is 5.69E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4320s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2832s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1488s for 90112 events => throughput is 6.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5517s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3928s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1589s for 90112 events => throughput is 
5.67E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.020066e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.690369e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.978626e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.998122e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3050s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2930s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3210s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3086s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0123s for 8192 events => throughput is 6.64E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4144s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1349s for 90112 events => throughput is 6.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5235s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3878s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1357s for 90112 events => throughput is 6.64E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.660341e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.740357e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.696167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.934348e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3346s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3127s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0220s for 8192 events => throughput is 3.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3284s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3089s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.22E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5228s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2980s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2249s for 90112 events => throughput is 4.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6116s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3942s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2174s for 90112 events => throughput is 4.15E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.916207e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.190038e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.910036e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.219873e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600109] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7194s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7188s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7212s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.39E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775386] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223782291775393] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6992s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6928s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8076s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8005s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0071s for 90112 events => throughput is 1.27E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782291775386) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782291775393) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.057278e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.036838e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.698426e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.660860e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.013072e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.989378e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.074632e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069849e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.996049e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.996952e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.152448e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.150687e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.011908e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.991689e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.029442e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.999027e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 61d36152df..fe11b37e1c 100644 --- 
a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,9 +1,9 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:36:21 +DATE: 2024-01-30_06:10:49 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3714s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3296s - [COUNTERS] Fortran MEs ( 1 ) : 0.0418s for 8192 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3836s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3391s + [COUNTERS] Fortran MEs ( 1 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3233s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2806s - [COUNTERS] Fortran MEs ( 1 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3381s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2936s + [COUNTERS] Fortran MEs ( 1 ) : 0.0445s for 8192 events => throughput is 1.84E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] MultiChannel = TRUE [XSECTION] 
Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7540s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2823s - [COUNTERS] Fortran MEs ( 1 ) : 0.4717s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8752s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3952s + [COUNTERS] Fortran MEs ( 1 ) : 0.4799s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690706767555099] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703999052587] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3524s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3167s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0356s for 8192 events => throughput is 2.30E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3627s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3261s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0366s for 8192 events => throughput is 2.24E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690706767555099) differ by less than 4E-4 (3.1663296096162696e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690703999052587) differ by less than 4E-4 (8.971448917094449e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782605295497] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223780103711483] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.6953s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3044s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3909s for 90112 events => throughput is 2.31E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8092s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4081s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4011s for 90112 events => throughput is 2.25E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782605295497) differ by less than 4E-4 (6.782658656945273e-09) +OK! xsec from fortran (46.223782291775372) and cpp (46.223780103711483) differ by less than 4E-4 (4.733632297249102e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.343511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.286506e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.350968e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292834e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690702885183541] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690699958440689] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3092s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2943s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0149s for 8192 events => throughput is 5.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3194s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3043s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0151s for 8192 events => throughput is 5.42E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690702885183541) differ by less than 4E-4 (1.1307059111231865e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690699958440689) differ by less than 4E-4 (1.744398380187917e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223778858016772] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223776162337749] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4474s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2803s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1671s for 90112 events => throughput is 5.39E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5555s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3898s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1657s for 90112 events => throughput is 5.44E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223778858016772) differ by less than 4E-4 (7.428553927546488e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223776162337749) differ by less than 4E-4 (1.326035499182865e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.223130e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.487444e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.234366e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.523247e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2951s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2868s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 8192 events => throughput is 9.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3066s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2985s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 8192 events => throughput is 1.00E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690694374060818) differ by less than 4E-4 (2.9153560099359765e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3687s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2793s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0894s for 90112 events => throughput is 1.01E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.4802s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3878s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0925s for 90112 events => throughput is 9.74E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223775951815753) differ by less than 4E-4 (1.3715795843527445e-07) +OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.987231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.007163e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.011425e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.012972e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690694374060818] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690691653203835] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.2935s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 8192 events => throughput is 1.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3042s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2966s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0076s for 8192 events => throughput is 1.08E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690694374060818) differ by less than 4E-4 (2.9153560099359765e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690691653203835) differ by less than 4E-4 (3.48587741338946e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223775951815753] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223773576247488] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.3676s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2860s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0816s for 90112 events => throughput is 1.10E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.4699s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3862s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 90112 events => throughput is 1.08E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223775951815753) differ by less than 4E-4 (1.3715795843527445e-07) +OK! xsec from fortran (46.223782291775372) and cpp (46.223773576247488) differ by less than 4E-4 (1.885507298071687e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.067146e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.018834e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108316e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.101159e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690698914467276] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690698822141186] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3028s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2920s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0108s for 8192 events => throughput is 7.58E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3023s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0112s for 8192 events => throughput is 7.33E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690698914467276) differ by less than 4E-4 (1.9633033720989346e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690698822141186) differ by less than 4E-4 (1.982662718447159e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223780273983500] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223780266165058] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4045s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2883s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1163s for 90112 events => throughput is 7.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5088s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3876s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1212s for 90112 events => throughput is 7.43E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223780273983500) differ by less than 4E-4 (4.3652677583772004e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223780266165058) differ by less than 4E-4 (4.382182106077437e-08) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.363750e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.590700e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.388485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.687831e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690703397697980] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690703397697987] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7047s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7042s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.50E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690703397697980) differ by less than 4E-4 (1.0232396019382861e-07) +OK! xsec from fortran (47.690708277600116) and cpp (47.690703397697987) differ by less than 4E-4 (1.0232396008280631e-07) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223786763175951] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7059s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7003s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 90112 events => throughput is 1.63E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8220s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8158s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0061s for 90112 events => throughput is 1.47E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223786763175951) differ by less than 4E-4 (9.673376699659286e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223786763175951) differ by less than 4E-4 (9.673376677454826e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.180729e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.211265e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.252831e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.993599e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.767543e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.733080e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.772780e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.767769e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.790178e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.726692e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.842637e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.882266e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.336409e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.370639e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.382414e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.407782e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 0673f7e59b..a855e5b8c2 100644 --- 
a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,15 +15,15 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:36:46 +DATE: 2024-01-30_06:11:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 420 events (found 1577 events) - [COUNTERS] PROGRAM TOTAL : 0.3604s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3188s - [COUNTERS] Fortran MEs ( 1 ) : 0.0416s for 8192 events => throughput is 1.97E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3775s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3336s + [COUNTERS] Fortran MEs ( 1 ) : 0.0439s for 8192 events => throughput is 1.87E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690708277600116] fbridge_mode=0 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3181s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2759s - [COUNTERS] Fortran MEs ( 1 ) : 0.0422s for 8192 events => throughput is 1.94E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3333s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2891s + [COUNTERS] Fortran MEs ( 1 ) : 0.0441s for 8192 events => throughput is 1.86E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] 
MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223782291775365] fbridge_mode=0 + [XSECTION] Cross section = 46.22 [46.223782291775372] fbridge_mode=0 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7030s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2436s - [COUNTERS] Fortran MEs ( 1 ) : 0.4593s for 90112 events => throughput is 1.96E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8813s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4017s + [COUNTERS] Fortran MEs ( 1 ) : 0.4796s for 90112 events => throughput is 1.88E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709601032019] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3545s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3161s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0384s for 8192 events => throughput is 2.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3684s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3289s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 8192 events => throughput is 2.07E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709601032026) differ by less than 2E-4 (2.7750309383733907e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709601032019) differ by less than 2E-4 (2.77503091616893e-08) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7327s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3085s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4242s for 90112 events => throughput is 2.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8620s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4222s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4398s for 90112 events => throughput is 2.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783635280988) differ by less than 2E-4 (2.9065246431869696e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ by less than 2E-4 (2.9065245987780486e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.135635e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.094601e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.125893e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.081772e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.69 [47.690709601032026] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3220s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3000s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3338s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3117s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0221s for 8192 events => throughput is 3.70E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783635280988] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783635280974] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5189s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2816s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2374s for 90112 events => throughput is 3.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6431s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3996s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2435s for 90112 events => throughput is 3.70E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783635280988) differ by less than 2E-4 (2.9065246431869696e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783635280974) differ by less than 2E-4 (2.9065245987780486e-08) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.761231e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.699859e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.742706e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.728578e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3057s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2927s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3066s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0141s for 8192 events => throughput is 5.81E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709681138244) differ by less than 2E-4 (2.9430012205011735e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4221s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2763s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1458s for 90112 events => throughput is 6.18E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5524s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3953s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1572s for 90112 events => throughput is 5.73E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ by less than 2E-4 (2.9427636771828247e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.050162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.934952e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.113895e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.894708e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3016s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2899s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3068s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.74E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709681138244) differ by less than 2E-4 (2.9430012205011735e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.4299s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2973s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1326s for 90112 events => throughput is 6.80E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5411s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4039s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1372s for 90112 events => throughput is 6.57E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ by less than 2E-4 (2.9427636771828247e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.791940e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.847513e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.001376e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.835511e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690709681138244] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690709643441508] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.3188s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2994s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0195s for 8192 events => throughput is 4.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3300s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3106s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0194s for 8192 events => throughput is 4.22E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690709681138244) differ by less than 2E-4 (2.9430012205011735e-08) +OK! xsec from fortran (47.690708277600116) and cpp (47.690709643441508) differ by less than 2E-4 (2.863957027088304e-08) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 46.22 [46.223783652032040] fbridge_mode=1 + [XSECTION] Cross section = 46.22 [46.223783660238851] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.5212s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3007s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2205s for 90112 events => throughput is 4.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6128s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4020s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2108s for 90112 events => throughput is 4.27E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223783652032040) differ by less than 2E-4 (2.9427636771828247e-08) +OK! xsec from fortran (46.223782291775372) and cpp (46.223783660238851) differ by less than 2E-4 (2.9605181861569463e-08) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.985174e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.284904e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.068643e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.343219e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 47.69 [47.690708266690699] fbridge_mode=1 + [XSECTION] Cross section = 47.69 [47.690708266690706] fbridge_mode=1 [UNWEIGHT] Wrote 434 events (found 1125 events) - [COUNTERS] PROGRAM TOTAL : 0.7071s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7066s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.40E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.7257s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7251s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.43E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (47.690708277600116) and cpp (47.690708266690699) differ by less than 2E-4 (2.2875357164053867e-10) +OK! xsec from fortran (47.690708277600116) and cpp (47.690708266690706) differ by less than 2E-4 (2.2875334959593374e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 46.22 [46.223782303744791] fbridge_mode=1 [UNWEIGHT] Wrote 1727 events (found 1732 events) - [COUNTERS] PROGRAM TOTAL : 1.7018s - [COUNTERS] Fortran Overhead ( 0 ) : 1.6953s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.39E+07 events/s + [COUNTERS] PROGRAM TOTAL : 1.8142s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8072s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0070s for 90112 events => throughput is 1.29E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (46.223782291775365) and cpp (46.223782303744791) differ by less than 2E-4 (2.5894508759449764e-10) +OK! xsec from fortran (46.223782291775372) and cpp (46.223782303744791) differ by less than 2E-4 (2.5894508759449764e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.014529e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.009523e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.676393e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.569379e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.016903e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.989648e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.060987e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.069002e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.006671e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.991680e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.140193e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.142771e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.002803e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.993358e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.981810e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.008591e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 1e8a82a6de..ad1d0f839b 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,12 +1,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z @@ -18,9 +18,9 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:37:12 +DATE: 2024-01-30_06:11:43 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5593s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2322s - [COUNTERS] Fortran MEs ( 1 ) : 0.3271s for 8192 events => throughput is 2.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5934s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2449s + [COUNTERS] Fortran MEs ( 1 ) : 0.3485s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5556s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2267s - [COUNTERS] Fortran MEs ( 1 ) : 0.3290s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5871s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2379s + [COUNTERS] Fortran MEs ( 1 ) : 0.3492s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0132s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4373s - [COUNTERS] Fortran MEs ( 1 ) : 3.5759s for 90112 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3849s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5565s + [COUNTERS] Fortran 
MEs ( 1 ) : 3.8284s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196357922470791E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8869s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5534s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3335s for 8192 events => throughput is 2.46E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9218s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5788s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3431s for 8192 events => throughput is 2.39E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470791E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196357922470777E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.5033s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7676s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7357s for 90112 events => throughput is 2.41E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.7045s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9313s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.7731s for 90112 events => throughput is 2.39E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.519174e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.459637e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.514864e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.457759e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470777E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5689s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3958s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1731s for 8192 events => throughput is 4.73E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5930s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4166s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1764s for 8192 events => throughput is 4.64E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.5161s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5979s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.9182s for 90112 events => throughput is 4.70E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.7266s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7761s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9506s for 90112 events => throughput is 4.62E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (0.0) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.818399e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.735934e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.781347e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.731299e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3971s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3094s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0877s for 8192 events => throughput is 9.34E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4266s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0935s for 8192 events => throughput is 8.76E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.4605s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5052s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9553s for 90112 events => throughput is 9.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6908s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6878s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0029s for 90112 events => throughput is 8.98E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.701055e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.260876e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.597059e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.205229e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3796s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3015s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4089s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3259s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0830s for 8192 events => throughput is 9.87E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3975s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5231s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8744s for 90112 events => throughput is 1.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5356s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6675s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8681s for 90112 events => throughput is 1.04E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.088451e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.066040e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080860e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071802e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470750E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4429s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3340s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1088s for 8192 events => throughput is 7.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4633s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3521s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1112s for 8192 events => throughput is 7.37E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655541E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.7475s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5382s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2094s for 90112 events => throughput is 7.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.9267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7103s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2164s for 90112 events => throughput is 7.41E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655541E-002) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655555E-002) differ by less than 2E-14 (2.220446049250313e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.536802e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.521586e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.505191e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.533151e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6621s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6566s + [COUNTERS] PROGRAM TOTAL : 0.6854s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6800s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0055s for 8192 events => throughput is 1.50E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655597E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872077655610E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8669s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8440s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0439s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0206s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872077655597E-002) differ by less than 2E-14 (4.440892098500626e-16) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872077655610E-002) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.613696e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.630499e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.322649e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.083902e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.665904e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.662154e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.242154e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.243596e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.685072e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.668083e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.252410e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.255740e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.680051e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.633959e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.765757e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.773665e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index c73e05bddd..c17be1788d 100644 --- 
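[Note, not part of the patch] A minimal Python sketch of the two derived figures these tmad logs print, using values copied from the MADEVENT_CUDA x10 block of log_ggttg_mad_d_inl0_hrd0.txt just above: the relative cross-section difference behind the "differ by less than 2E-14" check, and the events/s throughput behind the [COUNTERS] lines. This only illustrates the arithmetic; the exact rounding of the last printed digit of the residual depends on how the comparison script forms it, so no byte-for-byte match with the log is claimed.

  # Hedged sketch only: reproduces the arithmetic quoted in the log lines above,
  # not the tmad scripts themselves.
  xsec_fortran = 8.1310872077655569E-002  # [XSECTION] fbridge_mode=0 (Fortran x10 run)
  xsec_cudacpp = 8.1310872077655610E-002  # [XSECTION] fbridge_mode=1 (CUDA x10 run)

  # "OK! xsec from fortran (...) and cpp (...) differ by less than 2E-14": a relative difference check.
  rel_diff = abs(xsec_cudacpp / xsec_fortran - 1.0)
  print(rel_diff)              # a few 1e-16, i.e. a couple of double-precision ulps
  assert rel_diff < 2e-14      # double-precision builds; float builds use the looser 4E-4 tolerance

  # "[COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 90112 events => throughput is 3.85E+06 events/s"
  nevents, seconds = 90112, 0.0234
  print(f'{nevents / seconds:.2E} events/s')   # 3.85E+06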
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -2,26 +2,26 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' - - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:37:54 +DATE: 2024-01-30_06:12:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5542s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2274s - [COUNTERS] Fortran MEs ( 1 ) : 0.3268s for 8192 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5874s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2394s + [COUNTERS] Fortran MEs ( 1 ) : 0.3480s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5511s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2267s - [COUNTERS] Fortran MEs ( 1 ) : 0.3244s for 8192 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5918s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2437s + [COUNTERS] Fortran MEs ( 1 ) : 0.3481s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] 
MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0510s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4446s - [COUNTERS] Fortran MEs ( 1 ) : 3.6064s for 90112 events => throughput is 2.50E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3901s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5607s + [COUNTERS] Fortran MEs ( 1 ) : 3.8294s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196349765248158E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196347758884971E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8665s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5402s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3263s for 8192 events => throughput is 2.51E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.8754s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5552s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3203s for 8192 events => throughput is 2.56E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349765248158E-002) differ by less than 4E-4 (8.392518791033865e-08) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196347758884971E-002) differ by less than 4E-4 (1.0456755794585604e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310860767768514E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310858119443913E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.2983s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7336s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.5648s for 90112 events => throughput is 2.53E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.4534s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9123s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5411s for 90112 events => throughput is 2.54E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310860767768514E-002) differ by less than 4E-4 (1.3909440088610836e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310858119443913E-002) differ by less than 4E-4 (1.7166476384833373e-07) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.499402e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.651171e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.491631e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.640512e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196334183509370E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196323434217816E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4217s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0970s for 8192 events => throughput is 8.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4354s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3378s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0976s for 8192 events => throughput is 8.39E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196334183509370E-002) differ by less than 4E-4 (2.4423714939381114e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196323434217816E-002) differ by less than 4E-4 (3.548307125900152e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310847547651041E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842598054087E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.6020s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5094s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0926s for 90112 events => throughput is 8.25E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.7743s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6927s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0816s for 90112 events => throughput is 8.33E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310847547651041E-002) differ by less than 4E-4 (3.0168172948652483e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842598054087E-002) differ by less than 4E-4 (3.625542406293647e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.590308e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.607319e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.556597e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.623071e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3155s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2706s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0449s for 8192 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3345s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0463s for 8192 events => throughput is 1.77E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196330801117323E-002) differ by less than 4E-4 (2.790367255034454e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.9588s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4633s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4955s for 90112 events => throughput is 1.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1519s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6391s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5128s for 90112 events => throughput is 1.76E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310847326088065E-002) differ by less than 4E-4 (3.0440661691333304e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.839831e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.801449e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.863334e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.824776e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196330801117323E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196325695161859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3060s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2651s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3234s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2821s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0413s for 8192 events => throughput is 1.98E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196330801117323E-002) differ by less than 4E-4 (2.790367255034454e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196325695161859E-002) differ by less than 4E-4 (3.3156909984288774e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310847326088065E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310842393515825E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8963s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4517s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4446s for 90112 events => throughput is 2.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0864s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6356s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4508s for 90112 events => throughput is 2.00E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310847326088065E-002) differ by less than 4E-4 (3.0440661691333304e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310842393515825E-002) differ by less than 4E-4 (3.650697499857358e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.073522e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.066287e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.038945e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.073001e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196344079460428E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196344080460087E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3327s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2795s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0532s for 8192 events => throughput is 1.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3518s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2972s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0546s for 8192 events => throughput is 1.50E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196344079460428E-002) differ by less than 4E-4 (1.424231383939656e-07) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196344080460087E-002) differ by less than 4E-4 (1.4241285339888776e-07) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310857804286998E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310857813116089E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.0553s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4670s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5882s for 90112 events => throughput is 1.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2586s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6507s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.6080s for 90112 events => throughput is 1.48E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310857804286998E-002) differ by less than 4E-4 (1.7554071418679484e-07) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310857813116089E-002) differ by less than 4E-4 (1.754321300451167e-07) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.535514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.497722e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.548771e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.492408e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196349366365994E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196349366366022E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6530s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6521s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6751s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6742s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.66E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349366365994E-002) differ by less than 4E-4 (8.802906770188912e-08) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196349366366022E-002) differ by less than 4E-4 (8.802906736882221e-08) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310864949473968E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310864949473954E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8988s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8888s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0101s for 90112 events => throughput is 8.95E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0322s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0221s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0102s for 90112 events => throughput is 8.88E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310864949473968E-002) differ by less than 4E-4 (8.766578696306482e-08) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310864949473954E-002) differ by less than 4E-4 (8.766578729613173e-08) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.308714e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.288695e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.861624e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.864373e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.626812e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.630957e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.413554e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.365812e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.631638e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.633946e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.496116e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.471906e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.503568e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.509519e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.624454e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624050e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index b4daea9308..daa5ca9a3d 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -16,12 +16,12 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:38:32 +DATE: 2024-01-30_06:13:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 42 events (found 469 events) - [COUNTERS] PROGRAM TOTAL : 0.5629s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2337s - [COUNTERS] Fortran MEs ( 1 ) : 0.3292s for 8192 events => throughput is 2.49E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5921s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2413s + [COUNTERS] Fortran MEs ( 1 ) : 0.3507s for 8192 events => throughput is 2.34E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.0972 [9.7196357922470764E-002] fbridge_mode=0 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5518s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2262s - [COUNTERS] Fortran MEs ( 1 ) : 0.3256s for 8192 events => throughput is 2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5875s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2393s + [COUNTERS] Fortran MEs ( 1 ) : 0.3482s for 8192 events => throughput is 2.35E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872077655555E-002] fbridge_mode=0 + [XSECTION] Cross section = 0.08131 [8.1310872077655569E-002] fbridge_mode=0 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.0097s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4326s - [COUNTERS] Fortran MEs ( 1 ) : 3.5771s for 90112 events => throughput is 
2.52E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3863s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5579s + [COUNTERS] Fortran MEs ( 1 ) : 3.8284s for 90112 events => throughput is 2.35E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358763382007E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358763382021E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.8974s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5586s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3388s for 8192 events => throughput is 2.42E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9324s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3490s for 8192 events => throughput is 2.35E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358763382007E-002) differ by less than 2E-4 (8.651674487936134e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358763382021E-002) differ by less than 2E-4 (8.651674487936134e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872835011053E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 5.5261s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7672s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.7589s for 90112 events => throughput is 2.40E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.7762s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9305s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.8457s for 90112 events => throughput is 2.34E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872835011053E-002) differ by less than 2E-4 (9.31432020401246e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872835011053E-002) differ by less than 2E-4 (9.314319981967856e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.465437e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.405521e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.473065e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.412039e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358804670396E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358804670424E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.5634s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3926s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1708s for 8192 events => throughput is 4.80E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.5895s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4134s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1762s for 8192 events => throughput is 4.65E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358804670396E-002) differ by less than 2E-4 (9.076468021618211e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358804670424E-002) differ by less than 2E-4 (9.076468243662816e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.08131 [8.1310872836789727E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 3.4683s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5927s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8757s for 90112 events => throughput is 4.80E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.7571s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7661s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.9910s for 90112 events => throughput is 4.53E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872836789727E-002) differ by less than 2E-4 (9.336195150311255e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872836789727E-002) differ by less than 2E-4 (9.33619492826665e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.905170e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.812264e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.906590e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.790866e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3954s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3088s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0866s for 8192 events => throughput is 9.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3288s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0887s for 8192 events => throughput is 9.23E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501358E-002) differ by less than 2E-4 (6.831846421917476e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.4529s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9559s for 90112 events => throughput is 9.43E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.6655s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6760s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9895s for 90112 events => throughput is 9.11E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872708918333E-002) differ by less than 2E-4 (7.763571563401683e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.722917e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.369009e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.639767e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.305282e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358586501358E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.3758s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3002s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0756s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3954s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3173s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0781s for 8192 events => throughput is 1.05E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501358E-002) differ by less than 2E-4 (6.831846421917476e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872708918333E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.3291s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4914s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8377s for 90112 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5272s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6689s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8582s for 90112 events => throughput is 1.05E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872708918333E-002) differ by less than 2E-4 (7.763571563401683e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.100825e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.086568e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.120448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.099441e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358757578441E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358586501386E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.4736s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3530s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1206s for 8192 events => throughput is 6.79E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.4677s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3542s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1134s for 8192 events => throughput is 7.22E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358757578441E-002) differ by less than 2E-4 (8.591964695270349e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358586501386E-002) differ by less than 2E-4 (6.831846643962081e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872803699391E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872708918305E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 2.8047s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5601s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.2446s for 90112 events => throughput is 7.24E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.9514s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7052s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.2463s for 90112 events => throughput is 7.23E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872803699391E-002) differ by less than 2E-4 (8.929234462939917e-09) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872708918305E-002) differ by less than 2E-4 (7.763571119312473e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.211134e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.364043e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.316317e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.347069e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.0972 [9.7196358102981245E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.0972 [9.7196358102981231E-002] fbridge_mode=1 [UNWEIGHT] Wrote 41 events (found 467 events) - [COUNTERS] PROGRAM TOTAL : 0.6693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6639s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6813s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6759s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.52E+06 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981245E-002) differ by less than 2E-4 (1.8571733040317895e-09) +OK! xsec from fortran (9.7196357922470764E-002) and cpp (9.7196358102981231E-002) differ by less than 2E-4 (1.8571730819871846e-09) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.08131 [8.1310872068634174E-002] fbridge_mode=1 + [XSECTION] Cross section = 0.08131 [8.1310872068634160E-002] fbridge_mode=1 [UNWEIGHT] Wrote 679 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 1.8539s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8310s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0229s for 90112 events => throughput is 3.94E+06 events/s + [COUNTERS] PROGRAM TOTAL : 2.0449s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0215s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0233s for 90112 events => throughput is 3.86E+06 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (8.1310872077655555E-002) and cpp (8.1310872068634174E-002) differ by less than 2E-4 (1.1094924978749532e-10) +OK! xsec from fortran (8.1310872077655569E-002) and cpp (8.1310872068634160E-002) differ by less than 2E-4 (1.109495828544027e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.622730e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.624283e+06 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.249441e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.218126e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.646288e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.599538e+06 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.232584e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.232652e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.632648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.616286e+06 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.244267e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.243703e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.616374e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.609732e+06 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.720039e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.728637e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 2c14264714..930476d789 100644 --- 
a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,17 +15,17 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:39:14 +DATE: 2024-01-30_06:13:52 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.5067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2862s - [COUNTERS] Fortran MEs ( 1 ) : 4.2205s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7127s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3035s + [COUNTERS] Fortran MEs ( 1 ) : 4.4093s for 8192 events => throughput is 1.86E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5163s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2798s - [COUNTERS] Fortran MEs ( 1 ) : 4.2365s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7196s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2934s + [COUNTERS] Fortran MEs ( 1 ) : 4.4261s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.4845s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9328s - [COUNTERS] Fortran MEs ( 1 ) : 46.5516s for 90112 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.8146s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0980s + [COUNTERS] Fortran MEs ( 1 ) : 48.7166s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.8031s - [COUNTERS] Fortran Overhead ( 0 ) : 4.5022s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.3008s for 8192 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.5114s + [COUNTERS] Fortran Overhead ( 0 ) : 4.8392s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.6722s for 
8192 events => throughput is 1.75E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352993E-004) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421150E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 53.6260s - [COUNTERS] Fortran Overhead ( 0 ) : 6.1417s - [COUNTERS] CudaCpp MEs ( 2 ) : 47.4843s for 90112 events => throughput is 1.90E+03 events/s + [COUNTERS] PROGRAM TOTAL : 57.8440s + [COUNTERS] Fortran Overhead ( 0 ) : 6.6165s + [COUNTERS] CudaCpp MEs ( 2 ) : 51.2275s for 90112 events => throughput is 1.76E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421161E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421150E-004) differ by less than 2E-14 (5.551115123125783e-16) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.955415e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.804400e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.957400e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.805886e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352993E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.8602s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5384s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.3219s for 8192 events => throughput is 3.53E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.0600s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6542s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.4058s for 8192 events => throughput is 3.41E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311352993E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421156E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.4526s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1609s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.2916s for 90112 events => throughput is 3.56E+03 events/s + [COUNTERS] PROGRAM TOTAL : 30.6354s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3949s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2405s for 90112 events => throughput is 3.43E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421156E-004) differ by less than 2E-14 (2.220446049250313e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.699774e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.603718e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.692908e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.615187e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,13 +286,13 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2393s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2482s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9911s for 8192 events => throughput is 8.27E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.3469s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3045s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0424s for 8192 events => throughput is 7.86E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.8246s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9004s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.9241s for 90112 events => throughput is 8.25E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.6028s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0873s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.5154s for 90112 events => throughput is 7.83E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.470220e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.099934e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.480723e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.056733e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,13 +362,13 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0124s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1378s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8747s for 8192 events => throughput is 9.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.1181s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1916s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9265s for 8192 events => throughput is 8.84E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.5302s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7907s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.7395s for 90112 events => throughput is 9.25E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.1007s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9669s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.1338s for 90112 events => throughput is 8.89E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.611254e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.168199e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.611296e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.198606e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,13 +438,13 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311353009E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.4817s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3731s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1087s for 8192 events => throughput is 7.39E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6312s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4569s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1743s for 8192 events => throughput is 6.98E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311353009E-004) differ by less than 2E-14 (6.661338147750939e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.5304s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0362s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.4941s for 90112 events => throughput is 7.21E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.1783s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2363s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.9420s for 90112 events => throughput is 6.96E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (3.3306690738754696e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421158E-004) differ by less than 2E-14 (0.0) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.310034e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.039911e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.219164e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.063323e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,13 +514,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277311352998E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8199s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7868s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8345s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277311352998E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421161E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725748421166E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.8197s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4552s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3644s for 90112 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.9375s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5725s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3650s for 90112 events => throughput is 2.47E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725748421161E-004) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725748421166E-004) differ by less than 2E-14 (4.440892098500626e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.292727e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.275540e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.508942e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.510631e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.110003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.114953e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.156098e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.167963e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.100637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 
4.105704e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.153684e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.168745e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.109892e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.099883e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.425957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.425014e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index dbf6975e6c..5e8ad575df 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -16,16 +16,16 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
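The CUDACPP_BUILDDIR names in the build output above encode the test configuration: the SIMD mode (none/sse4/avx2/512y/512z), a floating-point tag matching the log file suffix (d, f or m, shown as DBL, FLT or MIX in the workflow summaries), and inl/hrd digits that appear to correspond to the inlineHel and hardcodePARAM flags printed on the Process lines. A small illustrative parser for such names, assuming this naming convention (the parser and its field names are hypothetical, not part of the test scripts):

import re

# Hypothetical parser for names like "build.512z_f_inl0_hrd0"; the field meanings are
# inferred from this log (SIMD mode, precision tag, inlineHel digit, hardcodePARAM digit).
BUILDDIR_RE = re.compile(r"build\.(?P<simd>[a-z0-9]+)_(?P<fptype>[dfm])_inl(?P<inl>\d)_hrd(?P<hrd>\d)")

def parse_builddir(name: str) -> dict:
    m = BUILDDIR_RE.fullmatch(name)
    if m is None:
        raise ValueError(f"unexpected build directory name: {name}")
    return m.groupdict()

print(parse_builddir("build.512z_f_inl0_hrd0"))
# {'simd': '512z', 'fptype': 'f', 'inl': '0', 'hrd': '0'}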
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:43:32 +DATE: 2024-01-30_06:18:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.5240s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s - [COUNTERS] Fortran MEs ( 1 ) : 4.2417s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7226s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2990s + [COUNTERS] Fortran MEs ( 1 ) : 4.4236s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5067s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2792s - [COUNTERS] Fortran MEs ( 1 ) : 4.2274s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7231s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2979s + [COUNTERS] Fortran MEs ( 1 ) : 4.4253s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.8698s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9436s - [COUNTERS] Fortran MEs ( 1 ) : 46.9262s for 90112 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.8315s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1047s + [COUNTERS] Fortran MEs ( 1 ) : 48.7267s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277396490802749E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277396352122325E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 8.6169s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3960s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.2209s for 8192 events => throughput is 1.94E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7124s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4408s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2715s for 
8192 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277396490802749E-004) differ by less than 4E-4 (3.2852368918590003e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396352122325E-004) differ by less than 4E-4 (3.2814141017745158e-06) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803774602344628E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803774048965294E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 52.7280s - [COUNTERS] Fortran Overhead ( 0 ) : 6.0827s - [COUNTERS] CudaCpp MEs ( 2 ) : 46.6453s for 90112 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.4678s + [COUNTERS] Fortran Overhead ( 0 ) : 6.2349s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.2329s for 90112 events => throughput is 1.91E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803774602344628E-004) differ by less than 4E-4 (3.0912915247593986e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803774048965294E-004) differ by less than 4E-4 (3.056275773571926e-06) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.998671e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.973797e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.001275e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.974372e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277389126121586E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277387698033752E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5615s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4108s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1507s for 8192 events => throughput is 7.12E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6573s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4642s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1930s for 8192 events => throughput is 6.87E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.6277277311352988E-004) and cpp (3.6277389126121586E-004) differ by less than 4E-4 (3.0822260348450925e-06) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277387698033752E-004) differ by less than 4E-4 (3.0428601303089664e-06) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803771887543366E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803770691658365E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.7555s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0766s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.6789s for 90112 events => throughput is 7.11E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.3220s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2267s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.0952s for 90112 events => throughput is 6.88E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803771887543366E-004) differ by less than 4E-4 (2.9195091675315865e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803770691658365E-004) differ by less than 4E-4 (2.8438380874629132e-06) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.214660e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.126754e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.260665e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.170734e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.2961s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7839s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5123s for 8192 events => throughput is 1.60E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3306s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8070s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5236s for 8192 events => throughput is 1.56E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277390198115864E-004) differ by less than 4E-4 (3.111776055053639e-06) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.1063s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4519s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.6543s for 90112 events => throughput is 1.59E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.3793s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5829s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.7964s for 90112 events => throughput is 1.55E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803774416711566E-004) differ by less than 4E-4 (3.079545366491132e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.649408e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.587083e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.639467e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.594558e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277390198115864E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277388844638422E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.1678s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7178s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4500s for 8192 events => throughput is 1.82E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.2103s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7457s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4646s for 8192 events => throughput is 1.76E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277390198115864E-004) differ by less than 4E-4 (3.111776055053639e-06) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277388844638422E-004) differ by less than 4E-4 (3.074466820685018e-06) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803774416711566E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803773310773457E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 7.3020s - [COUNTERS] Fortran Overhead ( 0 ) : 2.3817s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.9203s for 90112 events => throughput is 1.83E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.6352s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5218s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.1134s for 90112 events => throughput is 1.76E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803774416711566E-004) differ by less than 4E-4 (3.079545366491132e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803773310773457E-004) differ by less than 4E-4 (3.0095657856943347e-06) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.885655e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.751909e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.876738e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.760230e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277396394633404E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277396133530942E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 1.3886s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8337s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5549s for 8192 events => throughput is 1.48E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.4750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8777s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5973s for 8192 events => throughput is 1.37E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277396394633404E-004) differ by less than 4E-4 (3.2825859392904277e-06) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277396133530942E-004) differ by less than 4E-4 (3.2753885288450135e-06) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803777741065333E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803777739454609E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 8.7321s - [COUNTERS] Fortran Overhead ( 0 ) : 2.5116s - [COUNTERS] CudaCpp MEs ( 2 ) : 6.2205s for 90112 events => throughput is 1.45E+04 events/s + [COUNTERS] PROGRAM TOTAL : 9.1796s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7011s + [COUNTERS] CudaCpp MEs ( 2 ) : 6.4785s for 90112 events => throughput is 1.39E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803777741065333E-004) differ by less than 4E-4 (3.2898979009932106e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803777739454609E-004) differ by less than 4E-4 (3.2897959809652377e-06) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.496235e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.411903e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.469765e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.410632e+04 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277400478491260E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277400478491265E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.7802s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7588s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7967s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7754s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.83E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277400478491260E-004) differ by less than 4E-4 (3.3951593780834344e-06) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277400478491265E-004) differ by less than 4E-4 (3.395159378305479e-06) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803779990154892E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.6640s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4295s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2346s for 90112 events => throughput is 3.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7835s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5473s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2361s for 90112 events => throughput is 3.82E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803779990154892E-004) differ by less than 4E-4 (3.4322117830054566e-06) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803779990154892E-004) differ by less than 4E-4 (3.432211783227501e-06) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.580570e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.582485e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.934731e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.942798e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.492770e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.492976e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.733354e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.638150e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.509140e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.493239e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.727290e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.638925e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.483888e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.453709e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.531454e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.527726e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index cfbc1973b9..a372850ebe 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,11 +1,11 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -16,14 +16,14 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:46:56 +DATE: 2024-01-30_06:21:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 48 events (found 439 events) - [COUNTERS] PROGRAM TOTAL : 4.5513s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2808s - [COUNTERS] Fortran MEs ( 1 ) : 4.2704s for 8192 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7321s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3006s + [COUNTERS] Fortran MEs ( 1 ) : 4.4315s for 8192 events => throughput is 1.85E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277311352988E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.0003628 [3.6277277311352982E-004] fbridge_mode=0 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.5272s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2803s - [COUNTERS] Fortran MEs ( 1 ) : 4.2469s for 8192 events => throughput is 1.93E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7365s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2954s + [COUNTERS] Fortran MEs ( 1 ) : 4.4410s for 8192 events => throughput is 1.84E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725748421164E-004] fbridge_mode=0 + [XSECTION] Cross section = 0.000158 [1.5803725748421158E-004] fbridge_mode=0 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 48.8098s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9495s - [COUNTERS] Fortran MEs ( 1 ) : 46.8603s for 90112 events => throughput is 1.92E+03 events/s + [COUNTERS] PROGRAM TOTAL : 50.8810s + [COUNTERS] Fortran Overhead ( 0 ) : 2.1085s + [COUNTERS] Fortran MEs ( 1 ) : 48.7725s for 90112 events => throughput is 1.85E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,13 +134,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277432965013E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 9.0359s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6142s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.4217s for 8192 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 9.6340s + [COUNTERS] Fortran Overhead ( 0 ) : 4.9027s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7313s for 8192 events => throughput is 1.73E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277432965013E-004) differ by less than 2E-4 (3.352291999547674e-09) +OK! xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277432965013E-004) differ by less than 2E-4 (3.352291999547674e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725813026109E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725813026107E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 54.8666s - [COUNTERS] Fortran Overhead ( 0 ) : 6.2546s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.6120s for 90112 events => throughput is 1.85E+03 events/s + [COUNTERS] PROGRAM TOTAL : 58.7075s + [COUNTERS] Fortran Overhead ( 0 ) : 6.6716s + [COUNTERS] CudaCpp MEs ( 2 ) : 52.0359s for 90112 events => throughput is 1.73E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725813026109E-004) differ by less than 2E-4 (4.087956639864387e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725813026107E-004) differ by less than 2E-4 (4.087956861908992e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.917331e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.784786e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.930542e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.785841e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277430934464E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277430934459E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 4.7788s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4963s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2825s for 8192 events => throughput is 3.59E+03 events/s + [COUNTERS] PROGRAM TOTAL : 5.0010s + [COUNTERS] Fortran Overhead ( 0 ) : 2.6173s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.3837s for 8192 events => throughput is 3.44E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277430934464E-004) differ by less than 2E-4 (3.296318995538172e-09) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277430934459E-004) differ by less than 2E-4 (3.296318995538172e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.000158 [1.5803725816246317E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.000158 [1.5803725816246315E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 29.3040s - [COUNTERS] Fortran Overhead ( 0 ) : 4.1597s - [COUNTERS] CudaCpp MEs ( 2 ) : 25.1443s for 90112 events => throughput is 3.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 30.6721s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4448s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2273s for 90112 events => throughput is 3.44E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725816246317E-004) differ by less than 2E-4 (4.291719202242916e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725816246315E-004) differ by less than 2E-4 (4.291719424287521e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.675549e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.519557e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.694435e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.539213e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,13 +286,13 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.2311s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.9875s for 8192 events => throughput is 8.30E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.3362s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3065s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0298s for 8192 events => throughput is 7.96E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861753070292707e-09) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 13.8690s - [COUNTERS] Fortran Overhead ( 0 ) : 2.9050s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.9640s for 90112 events => throughput is 8.22E+03 events/s + [COUNTERS] PROGRAM TOTAL : 14.5206s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0822s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.4384s for 90112 events => throughput is 7.88E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155535589606e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.536816e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.081577e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.511555e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.110336e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,13 +362,13 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.0078s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1337s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8741s for 8192 events => throughput is 9.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.0953s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1807s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9146s for 8192 events => throughput is 8.96E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861753070292707e-09) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 12.4367s - [COUNTERS] Fortran Overhead ( 0 ) : 2.7931s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.6436s for 90112 events => throughput is 9.34E+03 events/s + [COUNTERS] PROGRAM TOTAL : 13.0092s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9623s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.0469s for 90112 events => throughput is 8.97E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155535589606e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.658673e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.305280e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.653676e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.278381e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,13 +438,13 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0003628 [3.6277277419683297E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 2.5071s - [COUNTERS] Fortran Overhead ( 0 ) : 1.3899s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.1172s for 8192 events => throughput is 7.33E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.6707s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4755s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1952s for 8192 events => throughput is 6.85E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861753070292707e-09) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277419683297E-004) differ by less than 2E-4 (2.9861755290738756e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725810769321E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 15.4633s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0792s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3840s for 90112 events => throughput is 7.28E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.3107s + [COUNTERS] Fortran Overhead ( 0 ) : 3.2346s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.0761s for 90112 events => throughput is 6.89E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155535589606e-09) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725810769321E-004) differ by less than 2E-4 (3.945155979678816e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.401661e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.979300e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.428706e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.981216e+03 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 2 - [XSECTION] Cross section = 0.0003628 [3.6277277293084707E-004] fbridge_mode=1 + [XSECTION] Cross section = 0.0003628 [3.6277277293084701E-004] fbridge_mode=1 [UNWEIGHT] Wrote 59 events (found 420 events) - [COUNTERS] PROGRAM TOTAL : 0.8222s - [COUNTERS] Fortran Overhead ( 0 ) : 0.7891s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8344s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8014s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.48E+05 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (3.6277277311352988E-004) and cpp (3.6277277293084707E-004) differ by less than 2E-4 (5.03573627241849e-10) +OK! 
xsec from fortran (3.6277277311352982E-004) and cpp (3.6277277293084701E-004) differ by less than 2E-4 (5.03573627241849e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_ [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000158 [1.5803725738731039E-004] fbridge_mode=1 [UNWEIGHT] Wrote 207 events (found 1235 events) - [COUNTERS] PROGRAM TOTAL : 2.8170s - [COUNTERS] Fortran Overhead ( 0 ) : 2.4535s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3635s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.9283s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5651s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3632s for 90112 events => throughput is 2.48E+05 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.5803725748421164E-004) and cpp (1.5803725738731039E-004) differ by less than 2E-4 (6.131544161291913e-10) +OK! xsec from fortran (1.5803725748421158E-004) and cpp (1.5803725738731039E-004) differ by less than 2E-4 (6.131540830622839e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.293320e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.286633e+05 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.523576e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.522365e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.111282e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.122442e+05 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.159178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.148040e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.106507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.112221e+05 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] 
[hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.157638e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.164785e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.099330e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.108117e+05 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.430536e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.430780e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 4f16911127..bc47a109df 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,12 +1,12 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z @@ -15,10 +15,10 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -27,13 +27,13 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2023-11-24_15:52:44 +DATE: 2024-01-30_06:27:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 98.5141s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4738s - [COUNTERS] Fortran MEs ( 1 ) : 98.0403s for 8192 events => throughput is 8.36E+01 events/s + [COUNTERS] PROGRAM TOTAL : 101.9143s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s + [COUNTERS] Fortran MEs ( 1 ) : 101.4320s for 8192 events => throughput is 8.08E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 98.4757s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4682s - [COUNTERS] Fortran MEs ( 1 ) : 98.0075s for 8192 events => throughput is 8.36E+01 events/s + [COUNTERS] PROGRAM TOTAL : 101.8572s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4740s + [COUNTERS] Fortran MEs ( 1 ) : 101.3832s for 8192 events => throughput is 8.08E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1079.6019s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3004s - [COUNTERS] Fortran MEs ( 1 ) : 1075.3015s for 90112 events => throughput is 8.38E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1118.0575s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3730s + [COUNTERS] Fortran MEs ( 1 ) : 1113.6844s for 90112 events => throughput is 8.09E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,13 +134,13 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 225.7194s - [COUNTERS] Fortran Overhead ( 0 ) : 104.5284s - [COUNTERS] CudaCpp MEs ( 2 ) : 121.1910s for 8192 events => throughput is 6.76E+01 events/s + [COUNTERS] PROGRAM TOTAL : 222.3358s + [COUNTERS] Fortran Overhead ( 0 ) : 102.7450s + [COUNTERS] CudaCpp MEs ( 2 ) : 119.5908s for 8192 events => throughput is 6.85E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec 
to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.4424906541753444e-15) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.220446049250313e-15) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813953E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813950E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1418.6305s - [COUNTERS] Fortran Overhead ( 0 ) : 107.9097s - [COUNTERS] CudaCpp MEs ( 2 ) : 1310.7208s for 90112 events => throughput is 6.87E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1439.6055s + [COUNTERS] Fortran Overhead ( 0 ) : 107.7197s + [COUNTERS] CudaCpp MEs ( 2 ) : 1331.8857s for 90112 events => throughput is 6.77E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813953E-007) differ by less than 2E-14 (1.1102230246251565e-15) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813950E-007) differ by less than 2E-14 (1.3322676295501878e-15) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.929728e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.948640e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.986885e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.570768e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435831E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 110.1960s - [COUNTERS] Fortran Overhead ( 0 ) : 50.7639s - [COUNTERS] CudaCpp MEs ( 2 ) : 59.4321s for 8192 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 115.4155s + [COUNTERS] Fortran Overhead ( 0 ) : 52.8722s + [COUNTERS] CudaCpp MEs ( 2 ) : 62.5433s for 8192 events => throughput is 1.31E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (2.220446049250313e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435831E-006) differ by less than 2E-14 (2.220446049250313e-15) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 706.5283s - [COUNTERS] Fortran Overhead ( 0 ) : 54.7734s - [COUNTERS] CudaCpp MEs ( 2 ) : 651.7549s for 90112 events => throughput is 1.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 742.6112s + [COUNTERS] Fortran Overhead ( 0 ) : 56.7177s + [COUNTERS] CudaCpp MEs ( 2 ) : 685.8936s for 90112 events => throughput is 1.31E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.649511e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.569503e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.642121e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.568080e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 51.5856s - [COUNTERS] Fortran Overhead ( 0 ) : 23.9016s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.6840s for 8192 events => throughput is 2.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 53.0772s + [COUNTERS] Fortran Overhead ( 0 ) : 24.7915s + [COUNTERS] CudaCpp MEs ( 2 ) : 28.2857s for 8192 events => throughput is 2.90E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 331.4778s - [COUNTERS] Fortran Overhead ( 0 ) : 27.5520s - [COUNTERS] CudaCpp MEs ( 2 ) : 303.9259s for 90112 events => throughput is 2.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 340.2848s + [COUNTERS] Fortran Overhead ( 0 ) : 28.6537s + [COUNTERS] CudaCpp MEs ( 2 ) : 311.6311s for 90112 events => throughput is 2.89E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.538462e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.398486e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.559381e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.399363e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 45.7105s - [COUNTERS] Fortran Overhead ( 0 ) : 20.8116s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.8989s for 8192 events => throughput is 3.29E+02 events/s + [COUNTERS] PROGRAM TOTAL : 46.8647s + [COUNTERS] Fortran Overhead ( 0 ) : 21.6683s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.1965s for 8192 events => throughput is 3.25E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 299.0798s - [COUNTERS] Fortran Overhead ( 0 ) : 24.6512s - [COUNTERS] CudaCpp MEs ( 2 ) : 274.4286s for 90112 events => throughput is 3.28E+02 events/s + [COUNTERS] PROGRAM TOTAL : 302.3706s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4811s + [COUNTERS] CudaCpp MEs ( 2 ) : 276.8895s for 90112 events => throughput is 3.25E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.987523e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.882449e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.957972e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.889860e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435827E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 47.6904s - [COUNTERS] Fortran Overhead ( 0 ) : 23.3628s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.3276s for 8192 events => throughput is 3.37E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.0231s + [COUNTERS] Fortran Overhead ( 0 ) : 24.6335s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.3895s for 8192 events => throughput is 3.23E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (2.4424906541753444e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435827E-006) differ by less than 2E-14 (1.7763568394002505e-15) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 293.8850s - [COUNTERS] Fortran Overhead ( 0 ) : 27.2969s - [COUNTERS] CudaCpp MEs ( 2 ) : 266.5881s for 90112 events => throughput is 3.38E+02 events/s + [COUNTERS] PROGRAM TOTAL : 308.5181s + [COUNTERS] Fortran Overhead ( 0 ) : 28.4720s + [COUNTERS] CudaCpp MEs ( 2 ) : 280.0461s for 90112 events => throughput is 3.22E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (9.992007221626409e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.560488e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.386684e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.577427e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.385729e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435838E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435829E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 4.2450s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1602s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.0848s for 8192 events => throughput is 7.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.2467s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1625s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0842s for 8192 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100945435838E-006) differ by less than 2E-14 (3.1086244689504383e-15) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100945435829E-006) differ by less than 2E-14 (1.9984014443252818e-15) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813958E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813960E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 18.7885s - [COUNTERS] Fortran Overhead ( 0 ) : 6.9034s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.8851s for 90112 events => throughput is 7.58E+03 events/s + [COUNTERS] PROGRAM TOTAL : 18.9518s + [COUNTERS] Fortran Overhead ( 0 ) : 7.0338s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9179s for 90112 events => throughput is 7.56E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436158813958E-007) differ by less than 2E-14 (8.881784197001252e-16) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436158813960E-007) differ by less than 2E-14 (8.881784197001252e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.552834e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.528868e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.253212e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.249701e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.219027e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.231891e+03 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.567176e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.557033e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.273376e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] 
(3a) = ( 9.244700e+03 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.442640e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.446855e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.208706e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.214530e+03 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.238633e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.244468e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 8e8ecf354b..c35aa0a017 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -3,8 +3,8 @@ CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -16,12 +16,12 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_17:21:33 +DATE: 2024-01-30_07:59:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 98.7413s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4669s - [COUNTERS] Fortran MEs ( 1 ) : 98.2745s for 8192 events => throughput is 8.34E+01 events/s + [COUNTERS] PROGRAM TOTAL : 101.8466s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4779s + [COUNTERS] Fortran MEs ( 1 ) : 101.3687s for 8192 events => throughput is 8.08E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 98.9213s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4778s - [COUNTERS] Fortran MEs ( 1 ) : 98.4435s for 8192 events => throughput is 8.32E+01 events/s + [COUNTERS] PROGRAM TOTAL : 101.7818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4752s + [COUNTERS] Fortran MEs ( 1 ) : 101.3066s for 8192 events => throughput is 8.09E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1080.6656s - [COUNTERS] Fortran Overhead ( 0 ) : 4.3073s - [COUNTERS] Fortran MEs ( 1 ) : 1076.3584s for 90112 events => throughput is 8.37E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1118.6550s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3767s + [COUNTERS] Fortran MEs ( 1 ) : 1114.2783s for 90112 events => throughput is 8.09E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694768344939596E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768374083672E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 202.3759s - [COUNTERS] Fortran Overhead ( 0 ) : 92.2956s - [COUNTERS] CudaCpp MEs ( 2 ) : 110.0803s for 8192 events => throughput is 7.44E+01 events/s + [COUNTERS] PROGRAM TOTAL : 205.5797s + [COUNTERS] Fortran Overhead ( 0 ) : 95.6059s + [COUNTERS] 
CudaCpp MEs ( 2 ) : 109.9738s for 8192 events => throughput is 7.45E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694768344939596E-006) differ by less than 4E-4 (0.00014259686216466783) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768374083672E-006) differ by less than 4E-4 (0.00014259935458071915) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361436150871156E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361435710758843E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1304.1851s - [COUNTERS] Fortran Overhead ( 0 ) : 97.1400s - [COUNTERS] CudaCpp MEs ( 2 ) : 1207.0450s for 90112 events => throughput is 7.47E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1305.3326s + [COUNTERS] Fortran Overhead ( 0 ) : 98.5377s + [COUNTERS] CudaCpp MEs ( 2 ) : 1206.7949s for 90112 events => throughput is 7.47E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361436150871156E-007) differ by less than 4E-4 (0.00014045934987350073) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435710758843E-007) differ by less than 4E-4 (0.0001404387438554977) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.937347e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.692219e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.901357e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.699275e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694765850750953E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694765360831655E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 50.8509s - [COUNTERS] Fortran Overhead ( 0 ) : 24.0533s - [COUNTERS] CudaCpp MEs ( 2 ) : 26.7976s for 8192 events => throughput is 3.06E+02 events/s + [COUNTERS] PROGRAM TOTAL : 52.0497s + [COUNTERS] Fortran Overhead ( 0 ) : 24.6638s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3859s for 8192 events => throughput is 2.99E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (1.1693100945435802E-006) and cpp (1.1694765850750953E-006) differ by less than 4E-4 (0.00014238355787066226) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1694765360831655E-006) differ by less than 4E-4 (0.00014234165972015766) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361430669586527E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429212586563E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 321.9393s - [COUNTERS] Fortran Overhead ( 0 ) : 27.9125s - [COUNTERS] CudaCpp MEs ( 2 ) : 294.0268s for 90112 events => throughput is 3.06E+02 events/s + [COUNTERS] PROGRAM TOTAL : 336.4854s + [COUNTERS] Fortran Overhead ( 0 ) : 29.3396s + [COUNTERS] CudaCpp MEs ( 2 ) : 307.1459s for 90112 events => throughput is 2.93E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430669586527E-007) differ by less than 4E-4 (0.00014020271663550687) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429212586563E-007) differ by less than 4E-4 (0.00014013450003202976) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.520695e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.371429e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.523057e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.391230e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 26.0191s - [COUNTERS] Fortran Overhead ( 0 ) : 12.1224s - [COUNTERS] CudaCpp MEs ( 2 ) : 13.8967s for 8192 events => throughput is 5.89E+02 events/s + [COUNTERS] PROGRAM TOTAL : 27.0721s + [COUNTERS] Fortran Overhead ( 0 ) : 12.6637s + [COUNTERS] CudaCpp MEs ( 2 ) : 14.4085s for 8192 events => throughput is 5.69E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 171.5242s - [COUNTERS] Fortran Overhead ( 0 ) : 15.8508s - [COUNTERS] CudaCpp MEs ( 2 ) : 155.6733s for 90112 events => throughput is 5.79E+02 events/s + [COUNTERS] PROGRAM TOTAL : 174.3265s + [COUNTERS] Fortran Overhead ( 0 ) : 16.5080s + [COUNTERS] CudaCpp MEs ( 2 ) : 157.8185s for 90112 events => throughput is 5.71E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.026908e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.735262e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.019796e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.739370e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694764951124567E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694764906356561E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 23.1132s - [COUNTERS] Fortran Overhead ( 0 ) : 10.7368s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.3764s for 8192 events => throughput is 6.62E+02 events/s + [COUNTERS] PROGRAM TOTAL : 23.8611s + [COUNTERS] Fortran Overhead ( 0 ) : 11.1738s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.6873s for 8192 events => throughput is 6.46E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694764951124567E-006) differ by less than 4E-4 (0.00014230662135994443) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694764906356561E-006) differ by less than 4E-4 (0.0001423027927767162) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361430425531218E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361429111797059E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 151.0249s - [COUNTERS] Fortran Overhead ( 0 ) : 14.5293s - [COUNTERS] CudaCpp MEs ( 2 ) : 136.4957s for 90112 events => throughput is 6.60E+02 events/s + [COUNTERS] PROGRAM TOTAL : 154.4819s + [COUNTERS] Fortran Overhead ( 0 ) : 15.1638s + [COUNTERS] CudaCpp MEs ( 2 ) : 139.3180s for 90112 events => throughput is 6.47E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361430425531218E-007) differ by less than 4E-4 (0.0001401912899885449) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361429111797059E-007) differ by less than 4E-4 (0.00014012978107680318) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.945001e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.672552e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.958337e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.678478e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694767957195604E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694768276769753E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 23.9825s - [COUNTERS] Fortran Overhead ( 0 ) : 11.9109s - [COUNTERS] CudaCpp MEs ( 2 ) : 12.0717s for 8192 events => throughput is 6.79E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.4092s + [COUNTERS] Fortran Overhead ( 0 ) : 12.6834s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.7257s for 8192 events => throughput is 6.44E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694767957195604E-006) differ by less than 4E-4 (0.00014256370209930758) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694768276769753E-006) differ by less than 4E-4 (0.00014259103224434355) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361435956349820E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361435948756818E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 148.9306s - [COUNTERS] Fortran Overhead ( 0 ) : 15.5819s - [COUNTERS] CudaCpp MEs ( 2 ) : 133.3487s for 90112 events => throughput is 6.76E+02 events/s + [COUNTERS] PROGRAM TOTAL : 156.3015s + [COUNTERS] Fortran Overhead ( 0 ) : 16.4754s + [COUNTERS] CudaCpp MEs ( 2 ) : 139.8261s for 90112 events => throughput is 6.44E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361435956349820E-007) differ by less than 4E-4 (0.00014045024240250115) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361435948756818E-007) differ by less than 4E-4 (0.00014044988689865257) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.105766e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.776081e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.113491e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.750726e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1694770708195000E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1694770708194997E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 2.4884s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9881s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.5003s for 8192 events => throughput is 1.64E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5003s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0019s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4984s for 8192 events => throughput is 1.64E+04 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1694770708195000E-006) differ by less than 4E-4 (0.00014279896898083955) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1694770708194997E-006) differ by less than 4E-4 (0.00014279896898039546) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1361443477565659E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1361443477565656E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 11.1878s - [COUNTERS] Fortran Overhead ( 0 ) : 5.7450s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.4428s for 90112 events => throughput is 1.66E+04 events/s + [COUNTERS] PROGRAM TOTAL : 11.2881s + [COUNTERS] Fortran Overhead ( 0 ) : 5.8695s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4186s for 90112 events => throughput is 1.66E+04 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1361443477565659E-007) differ by less than 4E-4 (0.0001408023850304474) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1361443477565656E-007) differ by less than 4E-4 (0.00014080238503022535) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.635622e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.635547e+04 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.625946e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.633264e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.339621e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.309560e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.381945e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.405493e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.320446e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) 
= ( 2.341562e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.325431e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.341458e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.300828e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.336833e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.393528e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.413620e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 963e0ec416..b9faa14c51 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -4,8 +4,8 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,10 +15,11 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. @@ -27,13 +28,12 @@ make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. 
make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2023-11-24_18:28:54 +DATE: 2024-01-30_09:07:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg @@ -57,11 +57,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 166 events) - [COUNTERS] PROGRAM TOTAL : 98.6622s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4669s - [COUNTERS] Fortran MEs ( 1 ) : 98.1953s for 8192 events => throughput is 8.34E+01 events/s + [COUNTERS] PROGRAM TOTAL : 101.9697s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s + [COUNTERS] Fortran MEs ( 1 ) : 101.4926s for 8192 events => throughput is 8.07E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -82,11 +82,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693100945435802E-006] fbridge_mode=0 + [XSECTION] Cross section = 1.169e-06 [1.1693100945435806E-006] fbridge_mode=0 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 98.6106s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4686s - [COUNTERS] Fortran MEs ( 1 ) : 98.1420s for 8192 events => throughput is 8.35E+01 events/s + [COUNTERS] PROGRAM TOTAL : 101.6914s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4764s + [COUNTERS] Fortran MEs ( 1 ) : 101.2150s for 8192 events => throughput is 8.09E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436158813976E-007] fbridge_mode=0 + [XSECTION] Cross section = 2.136e-07 [2.1358436158813979E-007] fbridge_mode=0 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1080.6388s - [COUNTERS] Fortran Overhead ( 0 ) : 4.2788s - [COUNTERS] Fortran MEs ( 1 ) : 1076.3600s for 90112 events => throughput is 8.37E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1118.2550s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3831s + [COUNTERS] Fortran MEs ( 1 ) : 1113.8719s for 90112 events => throughput is 8.09E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101016896846E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101016896844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 227.6225s - [COUNTERS] Fortran Overhead ( 0 ) : 105.5451s - [COUNTERS] CudaCpp MEs ( 2 ) : 122.0773s for 8192 events => throughput is 6.71E+01 events/s + [COUNTERS] PROGRAM TOTAL : 224.2502s + [COUNTERS] Fortran Overhead ( 0 ) : 
103.5045s + [COUNTERS] CudaCpp MEs ( 2 ) : 120.7457s for 8192 events => throughput is 6.78E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101016896846E-006) differ by less than 2E-4 (6.111385175699979e-09) +OK! xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101016896844E-006) differ by less than 2E-4 (6.1113847316107694e-09) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -167,27 +167,27 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436275882778E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 1457.2560s - [COUNTERS] Fortran Overhead ( 0 ) : 110.3750s - [COUNTERS] CudaCpp MEs ( 2 ) : 1346.8810s for 90112 events => throughput is 6.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1439.9019s + [COUNTERS] Fortran Overhead ( 0 ) : 107.2148s + [COUNTERS] CudaCpp MEs ( 2 ) : 1332.6870s for 90112 events => throughput is 6.76E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436275882778E-007) differ by less than 2E-4 (5.48115042242614e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436275882778E-007) differ by less than 2E-4 (5.48115042242614e-09) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.788273e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.977492e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.741929e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.962894e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -210,13 +210,13 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693101020910778E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 115.6749s - [COUNTERS] Fortran Overhead ( 0 ) : 52.7434s - [COUNTERS] CudaCpp MEs ( 2 ) : 62.9314s for 8192 events => throughput is 1.30E+02 events/s + [COUNTERS] PROGRAM TOTAL : 114.2875s + [COUNTERS] Fortran Overhead ( 0 ) : 54.1504s + [COUNTERS] CudaCpp MEs ( 2 ) : 60.1370s for 8192 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101020910778E-006) differ by less than 2E-4 (6.454658807442115e-09) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101020910778E-006) differ by less than 2E-4 (6.454658363352905e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436284111598E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436284111587E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 738.7127s - [COUNTERS] Fortran Overhead ( 0 ) : 56.7745s - [COUNTERS] CudaCpp MEs ( 2 ) : 681.9382s for 90112 events => throughput is 1.32E+02 events/s + [COUNTERS] PROGRAM TOTAL : 713.5498s + [COUNTERS] Fortran Overhead ( 0 ) : 58.1042s + [COUNTERS] CudaCpp MEs ( 2 ) : 655.4456s for 90112 events => throughput is 1.37E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436284111598E-007) differ by less than 2E-4 (5.866422903011426e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436284111587E-007) differ by less than 2E-4 (5.866422458922216e-09) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.576187e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.524488e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.573070e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.529646e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 49.9825s - [COUNTERS] Fortran Overhead ( 0 ) : 22.7532s - [COUNTERS] CudaCpp MEs ( 2 ) : 27.2294s for 8192 events => throughput is 3.01E+02 events/s + [COUNTERS] PROGRAM TOTAL : 50.7719s + [COUNTERS] Fortran Overhead ( 0 ) : 23.4236s + [COUNTERS] CudaCpp MEs ( 2 ) : 27.3483s for 8192 events => throughput is 3.00E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 330.1981s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8024s - [COUNTERS] CudaCpp MEs ( 2 ) : 303.3957s for 90112 events => throughput is 2.97E+02 events/s + [COUNTERS] PROGRAM TOTAL : 326.3386s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1544s + [COUNTERS] CudaCpp MEs ( 2 ) : 299.1842s for 90112 events => throughput is 3.01E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.700163e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.567666e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.716172e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.597479e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 43.9707s - [COUNTERS] Fortran Overhead ( 0 ) : 19.9051s - [COUNTERS] CudaCpp MEs ( 2 ) : 24.0656s for 8192 events => throughput is 3.40E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.3921s + [COUNTERS] Fortran Overhead ( 0 ) : 20.5216s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8705s for 8192 events => throughput is 3.29E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 289.2968s - [COUNTERS] Fortran Overhead ( 0 ) : 23.7412s - [COUNTERS] CudaCpp MEs ( 2 ) : 265.5556s for 90112 events => throughput is 3.39E+02 events/s + [COUNTERS] PROGRAM TOTAL : 301.9509s + [COUNTERS] Fortran Overhead ( 0 ) : 24.4062s + [COUNTERS] CudaCpp MEs ( 2 ) : 277.5446s for 90112 events => throughput is 3.25E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.209510e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.101718e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.226810e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.081457e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.169e-06 [1.1693101021831071E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.169e-06 [1.1693101021831069E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 46.6653s - [COUNTERS] Fortran Overhead ( 0 ) : 22.7630s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.9023s for 8192 events => throughput is 3.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.9503s + [COUNTERS] Fortran Overhead ( 0 ) : 23.9070s + [COUNTERS] CudaCpp MEs ( 2 ) : 25.0433s for 8192 events => throughput is 3.27E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693101021831071E-006) differ by less than 2E-4 (6.5333627397023974e-09) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693101021831069E-006) differ by less than 2E-4 (6.533362073568583e-09) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.136e-07 [2.1358436281462142E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.136e-07 [2.1358436281462147E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 296.3640s - [COUNTERS] Fortran Overhead ( 0 ) : 26.8877s - [COUNTERS] CudaCpp MEs ( 2 ) : 269.4763s for 90112 events => throughput is 3.34E+02 events/s + [COUNTERS] PROGRAM TOTAL : 301.8849s + [COUNTERS] Fortran Overhead ( 0 ) : 27.8392s + [COUNTERS] CudaCpp MEs ( 2 ) : 274.0457s for 90112 events => throughput is 3.29E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436281462142E-007) differ by less than 2E-4 (5.742375686068613e-09) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436281462147E-007) differ by less than 2E-4 (5.7423759081132175e-09) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.629406e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501339e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.640208e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.509355e+02 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,13 +514,13 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.169e-06 [1.1693100942770687E-006] fbridge_mode=1 [UNWEIGHT] Wrote 15 events (found 163 events) - [COUNTERS] PROGRAM TOTAL : 3.5613s - [COUNTERS] Fortran Overhead ( 0 ) : 2.6950s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8663s for 8192 events => throughput is 9.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 3.5767s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7138s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8630s for 8192 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.1693100945435802E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.2792201459509442e-10) +OK! 
xsec from fortran (1.1693100945435806E-006) and cpp (1.1693100942770687E-006) differ by less than 2E-4 (2.279223476620018e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -547,56 +547,56 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.136e-07 [2.1358436157495368E-007] fbridge_mode=1 [UNWEIGHT] Wrote 84 events (found 808 events) - [COUNTERS] PROGRAM TOTAL : 15.9559s - [COUNTERS] Fortran Overhead ( 0 ) : 6.4625s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.4934s for 90112 events => throughput is 9.49E+03 events/s + [COUNTERS] PROGRAM TOTAL : 16.0811s + [COUNTERS] Fortran Overhead ( 0 ) : 6.5876s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4935s for 90112 events => throughput is 9.49E+03 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.1358436158813976E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173705990875078e-11) +OK! xsec from fortran (2.1358436158813979E-007) and cpp (2.1358436157495368E-007) differ by less than 2E-4 (6.173717093105324e-11) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.436689e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.427839e+03 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.080384e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.087147e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.108886e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.109838e+04 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 512 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.161919e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.157967e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.113576e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.106789e+04 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] 
[inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.109641e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.114403e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.111246e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111816e+04 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.639262e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.650481e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index f27ee93a59..1fb13570ed 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -3,9 +3,9 @@ CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none - make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. 
@@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:51:16 +DATE: 2024-01-30_06:26:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3170s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2461s - [COUNTERS] Fortran MEs ( 1 ) : 0.0708s for 8192 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3322s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2581s + [COUNTERS] Fortran MEs ( 1 ) : 0.0741s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3076s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2367s - [COUNTERS] Fortran MEs ( 1 ) : 0.0710s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3239s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2500s + [COUNTERS] Fortran MEs ( 1 ) : 0.0739s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2394s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4635s - [COUNTERS] Fortran MEs ( 1 ) : 0.7759s for 90112 events => throughput is 1.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4169s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6004s + [COUNTERS] Fortran MEs ( 1 ) : 0.8166s for 90112 events => throughput is 1.10E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703710] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3933s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3167s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0766s for 8192 events => throughput is 1.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4205s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3380s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0825s for 8192 events => throughput is 9.92E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703716) differ by less than 2E-14 (0.0) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703710) differ by less than 2E-14 (2.220446049250313e-16) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4082s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5618s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8464s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6157s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7094s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9063s for 90112 events => throughput is 9.94E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615863) differ by less than 2E-14 (5.551115123125783e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ by less than 2E-14 (0.0) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.081037e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.011343e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.086206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.009603e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2869s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0426s for 8192 events => throughput is 1.92E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2965s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0423s for 8192 events => throughput is 1.94E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -243,27 +243,27 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.0022s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5391s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4630s for 90112 events => throughput is 1.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1314s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6642s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4672s for 90112 events => throughput is 1.93E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615872) differ by less than 2E-14 (1.1102230246251565e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615872) differ by less than 2E-14 (0.0) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.967710e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951695e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.951729e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960178e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2892s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2650s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0242s for 8192 events => throughput is 3.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3038s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2788s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0250s for 8192 events => throughput is 3.28E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7692s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5041s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2652s for 90112 events => throughput is 3.40E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9222s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6468s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2754s for 90112 events => throughput is 3.27E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615863) differ by less than 2E-14 (5.551115123125783e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.443138e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.305728e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.417632e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.308522e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2906s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2677s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 8192 events => throughput is 3.59E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2978s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2759s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0219s for 8192 events => throughput is 3.74E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7686s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5286s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2400s for 90112 events => throughput is 3.75E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8887s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6471s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2415s for 90112 events => throughput is 3.73E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615863) differ by less than 2E-14 (5.551115123125783e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.729118e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.792718e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.747676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.861917e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -436,15 +436,15 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333309703727] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3058s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2735s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0323s for 8192 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3207s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2877s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333309703733) differ by less than 2E-14 (6.661338147750939e-16) +OK! 
xsec from fortran (0.26050333309703716) and cpp (0.26050333309703727) differ by less than 2E-14 (4.440892098500626e-16) *** (2-512z) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -469,29 +469,29 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615863] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8682s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5123s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3559s for 90112 events => throughput is 2.53E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0189s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3622s for 90112 events => throughput is 2.49E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182648615863) differ by less than 2E-14 (5.551115123125783e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.352575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.535117e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.365058e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.517520e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,8 +514,8 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703733] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6681s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6674s + [COUNTERS] PROGRAM TOTAL : 0.6879s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6872s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182648615869] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9647s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9564s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 90112 events => throughput is 1.09E+07 events/s + [COUNTERS] PROGRAM TOTAL : 2.0638s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0555s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0083s for 90112 events => throughput is 1.08E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! 
xsec from fortran (0.21801182648615874) and cpp (0.21801182648615869) differ by less than 2E-14 (2.220446049250313e-16) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182648615869) differ by less than 2E-14 (1.1102230246251565e-16) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.568354e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.535376e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.005124e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.131781e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.387855e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.380880e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.497397e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.511409e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.373048e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.374806e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.771015e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.787335e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382200e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.380768e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow 
summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.783427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.782273e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index f629b5c150..4985f151b2 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -16,12 +16,12 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:51:46 +DATE: 2024-01-30_06:26:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3117s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2399s - [COUNTERS] Fortran MEs ( 1 ) : 0.0718s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3277s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2529s + [COUNTERS] Fortran MEs ( 1 ) : 0.0748s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3099s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2384s - [COUNTERS] Fortran MEs ( 1 ) : 0.0715s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3225s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2485s + [COUNTERS] Fortran MEs ( 1 ) : 0.0740s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < 
/tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801182648615872] fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2455s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4617s - [COUNTERS] Fortran MEs ( 1 ) : 0.7838s for 90112 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4071s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5926s + [COUNTERS] Fortran MEs ( 1 ) : 0.8145s for 90112 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -132,15 +132,15 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050316058770007] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050314903825744] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3857s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3127s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0731s for 8192 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4015s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3277s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0738s for 8192 events => throughput is 1.11E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050316058770007) differ by less than 4E-4 (6.622154696822591e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050314903825744) differ by less than 4E-4 (7.065505747139156e-07) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -165,29 +165,29 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182797520666] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801181770186087] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.3684s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5619s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8065s for 90112 events => throughput is 1.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5139s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7000s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8139s for 90112 events => throughput is 1.11E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182797520666) differ by less than 4E-4 (6.830124466006282e-09) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801181770186087) differ by less than 4E-4 (4.0292758352045155e-08) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.132102e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131056e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.140766e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131141e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050313133963987] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050310835231938] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2925s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2662s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0263s for 8192 events => throughput is 3.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0269s for 8192 events => throughput is 3.04E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313133963987) differ by less than 4E-4 (7.744906558304621e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310835231938) differ by less than 4E-4 (8.627325996934943e-07) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801179276862181] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801177817838580] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7892s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5021s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2870s for 90112 events => throughput is 3.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9246s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6468s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2778s for 90112 events => throughput is 3.24E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801179276862181) differ by less than 4E-4 (1.5465921032742358e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177817838580) differ by less than 4E-4 (2.2158326773435988e-07) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.189560e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.299610e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.133717e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.290596e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -284,15 +284,15 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2638s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2508s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0130s for 8192 events => throughput is 6.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2803s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2672s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0131s for 8192 events => throughput is 6.25E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313344346482) differ by less than 4E-4 (7.664146557395668e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) *** (2-avx2) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -317,29 +317,29 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6261s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4856s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1405s for 90112 events => throughput is 6.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7784s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6325s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1459s for 90112 events => throughput is 6.18E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801179137376883) differ by less than 4E-4 (1.6105727140836024e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.270432e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.309270e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.321323e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.331612e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -360,15 +360,15 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050313344346482] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050310803492405] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2747s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2611s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0135s for 8192 events => throughput is 6.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2783s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2663s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0120s for 8192 events => throughput is 6.84E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050313344346482) differ by less than 4E-4 (7.664146557395668e-07) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050310803492405) differ by less than 4E-4 (8.639509921914978e-07) *** (2-512y) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -393,29 +393,29 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801179137376883] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801177493542723] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.6570s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5241s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1330s for 90112 events => throughput is 6.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7659s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6337s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1322s for 90112 events => throughput is 6.82E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801179137376883) differ by less than 4E-4 (1.6105727140836024e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801177493542723) differ by less than 4E-4 (2.364584175129636e-07) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.825162e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.987405e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.838486e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.079276e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050317064561834] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2742s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2582s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0161s for 8192 events => throughput is 5.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2869s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2697s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0173s for 8192 events => throughput is 4.74E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182143140752] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7047s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1853s for 90112 events => throughput is 4.86E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8266s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6389s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1877s for 90112 events => throughput is 4.80E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182143140752) differ by less than 4E-4 (2.3185674380421517e-08) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182143140752) differ by less than 4E-4 (2.3185674269399215e-08) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.717607e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.948471e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.789235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.943070e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -514,9 +514,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050319131407651] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6713s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6707s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.54E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6865s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.55E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** @@ -547,56 +547,56 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801186038252196] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9216s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9152s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.41E+07 events/s + [COUNTERS] PROGRAM TOTAL : 2.0580s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0515s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0065s for 90112 events => throughput is 1.38E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801186038252196) differ by less than 4E-4 (1.5547946996541384e-07) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801186038252196) differ by less than 4E-4 (1.5547946996541384e-07) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.688652e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.646730e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.419660e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.486298e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.806306e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.789487e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.722148e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.699254e+08 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.808789e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.776807e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.809471e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.780761e+08 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.332763e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.361160e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.066283e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.995376e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 8fb8683f4e..44df8a9e3d 100644 --- 
a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -2,10 +2,10 @@ Working directory (build): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/g CUDACPP_BUILDDIR='.' - make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y @@ -15,13 +15,13 @@ make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. @@ -33,7 +33,7 @@ make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ OMP_NUM_THREADS= -DATE: 2023-11-24_15:52:14 +DATE: 2024-01-30_06:27:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: Working directory (run): /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 78 events (found 561 events) - [COUNTERS] PROGRAM TOTAL : 0.3110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2397s - [COUNTERS] Fortran MEs ( 1 ) : 0.0713s for 8192 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3260s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2519s + [COUNTERS] Fortran MEs ( 1 ) : 0.0740s for 8192 events => throughput is 1.11E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333309703716] fbridge_mode=0 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3123s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2407s - [COUNTERS] Fortran MEs ( 1 ) : 0.0716s for 8192 events => throughput is 1.14E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3236s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2490s + [COUNTERS] Fortran MEs ( 1 ) : 0.0746s for 8192 events => throughput is 1.10E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -107,11 +107,11 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182648615874] fbridge_mode=0 + [XSECTION] Cross section = 0.218 [0.21801182648615872] 
fbridge_mode=0 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.2374s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4535s - [COUNTERS] Fortran MEs ( 1 ) : 0.7839s for 90112 events => throughput is 1.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4069s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5940s + [COUNTERS] Fortran MEs ( 1 ) : 0.8130s for 90112 events => throughput is 1.11E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333282657206] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3957s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3178s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0780s for 8192 events => throughput is 1.05E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4196s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3367s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0829s for 8192 events => throughput is 9.88E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -165,29 +165,29 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182636608796] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182636608801] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 2.4134s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5610s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8524s for 90112 events => throughput is 1.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.6225s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7110s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9116s for 90112 events => throughput is 9.89E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182636608796) differ by less than 2E-4 (5.507535538740171e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608801) differ by less than 2E-4 (5.507531097848073e-10) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.065790e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000214e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.078999e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.992747e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -208,15 +208,15 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333282657201] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333282657212] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3246s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2839s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0407s for 8192 events => throughput is 2.01E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3393s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2968s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0425s for 8192 events => throughput is 1.93E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333282657201) differ by less than 2E-4 (1.0382406046005599e-09) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333282657212) differ by less than 2E-4 (1.0382402715336525e-09) *** (2-sse4) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -241,29 +241,29 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182636608810] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182636608804] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9711s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5247s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4464s for 90112 events => throughput is 2.02E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.1463s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6727s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4735s for 90112 events => throughput is 1.90E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182636608810) differ by less than 2E-4 (5.507528877402024e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182636608804) differ by less than 2E-4 (5.507529987625048e-10) *** (2-sse4) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.994361e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.936317e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.977417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.928004e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2878s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2640s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0238s for 8192 events => throughput is 3.45E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3040s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2793s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0246s for 8192 events => throughput is 3.33E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -319,27 +319,27 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7622s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4988s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2634s for 90112 events => throughput is 3.42E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9238s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6513s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2724s for 90112 events => throughput is 3.31E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557156874085422e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) *** (2-avx2) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.380399e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.358469e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.457644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.409112e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.2826s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2612s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0213s for 8192 events => throughput is 3.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.2986s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2774s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0212s for 8192 events => throughput is 3.86E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -395,27 +395,27 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.7310s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4992s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2318s for 90112 events => throughput is 3.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8897s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6536s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2361s for 90112 events => throughput is 3.82E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557156874085422e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) *** (2-512y) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.844575e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.861136e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.903023e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.873456e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2605 [0.26050333291481387] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.3081s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2751s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0330s for 8192 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3231s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2893s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 8192 events => throughput is 2.42E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -471,27 +471,27 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.218 [0.21801182638680733] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.8866s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5159s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3706s for 90112 events => throughput is 2.43E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0365s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6643s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3722s for 90112 events => throughput is 2.42E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557156874085422e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182638680733) differ by less than 2E-4 (4.557155763862397e-10) *** (2-512z) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.345187e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.448669e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.413954e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.422091e+05 ) sec^-1 *** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** -------------------- @@ -512,15 +512,15 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.2605 [0.26050333301029693] fbridge_mode=1 + [XSECTION] Cross section = 0.2605 [0.26050333301029699] fbridge_mode=1 [UNWEIGHT] Wrote 81 events (found 540 events) - [COUNTERS] PROGRAM TOTAL : 0.6718s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6711s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s + [COUNTERS] PROGRAM TOTAL : 0.6889s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.21E+07 events/s *** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333301029693) differ by less than 2E-4 (3.329716502520341e-10) +OK! xsec from fortran (0.26050333309703716) and cpp (0.26050333301029699) differ by less than 2E-4 (3.329714282074292e-10) *** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -545,58 +545,58 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 0.218 [0.21801182637219935] fbridge_mode=1 + [XSECTION] Cross section = 0.218 [0.21801182637219937] fbridge_mode=1 [UNWEIGHT] Wrote 853 events (found 1849 events) - [COUNTERS] PROGRAM TOTAL : 1.9585s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9503s + [COUNTERS] PROGRAM TOTAL : 2.0663s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0581s [COUNTERS] CudaCpp MEs ( 2 ) : 0.0082s for 90112 events => throughput is 1.10E+07 events/s *** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (0.21801182648615874) and cpp (0.21801182637219935) differ by less than 2E-4 (5.227211996583492e-10) +OK! xsec from fortran (0.21801182648615872) and cpp (0.21801182637219937) differ by less than 2E-4 (5.227208665914418e-10) *** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! 
events.lhe.cuda.10 and events.lhe.ref.10 are identical *** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.574709e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.534715e+07 ) sec^-1 *** EXECUTE GCHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.987655e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.123919e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.382555e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.382422e+07 ) sec^-1 *** EXECUTE GCHECK(MAX) -p 16384 32 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.503146e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.503129e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.367209e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.385930e+07 ) sec^-1 *** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.807426e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.826918e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.377892e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.379565e+07 ) sec^-1 *** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** -Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.781332e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.789199e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/madX.sh b/epochX/cudacpp/tmad/madX.sh index d03ca9f65f..991ab62ea2 100755 --- a/epochX/cudacpp/tmad/madX.sh +++ b/epochX/cudacpp/tmad/madX.sh @@ -420,7 +420,13 @@ 
printf "\nOMP_NUM_THREADS=$OMP_NUM_THREADS\n" printf "\nDATE: $(date '+%Y-%m-%d_%H:%M:%S')\n\n" -if nvidia-smi -L > /dev/null 2>&1; then gpuTxt="$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)"; else gpuTxt=none; fi +if nvidia-smi -L > /dev/null 2>&1; then + gpuTxt="$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)" +elif rocm-smi -i > /dev/null 2>&1; then + gpuTxt="$(rocm-smi --showproductname | grep 'Card series' | awk '{print $5,$6,$7}')" +else + gpuTxt=none +fi if [ "${unames}" == "Darwin" ]; then cpuTxt=$(sysctl -h machdep.cpu.brand_string) cpuTxt=${cpuTxt/machdep.cpu.brand_string: } diff --git a/epochX/cudacpp/tput/allTees.sh b/epochX/cudacpp/tput/allTees.sh index f67fa5eccf..9be4f5d4fc 100755 --- a/epochX/cudacpp/tput/allTees.sh +++ b/epochX/cudacpp/tput/allTees.sh @@ -5,12 +5,14 @@ scrdir=$(cd $(dirname $0); pwd) -# By default, use the madevent+cudacpp version of code and tee scripts -sa= +# By default, use the madevent+cudacpp version of code and tee scripts (use -sa to use the standalone version instead) +# By default, build and run all tests (use -makeonly to only build all tests) +opts= suff=".mad" # Parse command line arguments ggttggg=-ggttggg +rndhst=-curhst while [ "$1" != "" ]; do if [ "$1" == "-short" ]; then # Short (no ggttggg) or long version? @@ -22,11 +24,19 @@ while [ "$1" != "" ]; do shift elif [ "$1" == "-sa" ]; then # Use standalone_cudacpp builds instead of madevent+cudacpp? - sa=-sa + opts+=" -sa" suff=".sa" shift + elif [ "$1" == "-makeonly" ]; then + # Only build all tests instead of building and running them? + opts+=" -makeonly" + shift + elif [ "$1" == "-hip" ]; then + # Random numbers use rocrand instead of curand? + rndhst=-rorhst + shift else - echo "Usage: $0 [-short] [-e] [-sa]" + echo "Usage: $0 [-short] [-e] [-sa] [-makeonly] [-hip]" exit 1 fi done @@ -40,7 +50,7 @@ started="STARTED AT $(date)" # (36/78) Six logs (double/float/mixed x hrd0/hrd1 x inl0) in each of the six processes \rm -rf gg_ttggg${suff}/lib/build.none_* -cmd="./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -makeclean ${sa}" +cmd="./tput/teeThroughputX.sh -mix -hrd -makej -eemumu -ggtt -ggttg -ggttgg -gqttq $ggttggg -makeclean ${opts}" $cmd; status=$? ended1="$cmd\nENDED(1) AT $(date) [Status=$status]" tmp1=$(mktemp) @@ -49,29 +59,29 @@ ls -ltr ee_mumu${suff}/lib/build.none_*_inl0_hrd* gg_tt${suff}/lib/build.none_*_ # (48/78) Four extra logs (double/float x hrd0/hrd1 x inl1) only in three of the six processes \rm -rf gg_ttg${suff}/lib/build.none_* \rm -rf gg_ttggg${suff}/lib/build.none_* -cmd="./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ${sa}" +cmd="./tput/teeThroughputX.sh -flt -hrd -makej -eemumu -ggtt -ggttgg -inlonly -makeclean ${opts}" $cmd; status=$? ended2="$cmd\nENDED(2) AT $(date) [Status=$status]" tmp2=$(mktemp) ls -ltr ee_mumu${suff}/lib/build.none_*_inl1_hrd* gg_tt${suff}/lib/build.none_*_inl1_hrd* gg_tt*g${suff}/lib/build.none_*_inl1_hrd* | egrep -v '(total|\./|\.build|_common|^$)' > $tmp2 # (60/78) Two extra logs (double/float x hrd0 x inl0 + bridge) in all six processes (rebuild from cache) -cmd="./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg $ggttggg -flt -bridge -makeclean ${sa}" +cmd="./tput/teeThroughputX.sh -makej -eemumu -ggtt -ggttg -gqttq -ggttgg $ggttggg -flt -bridge -makeclean ${opts}" $cmd; status=$? 
ended3="$cmd\nENDED(3) AT $(date) [Status=$status]" # (66/78) Two extra logs (double/float x hrd0 x inl0 + rmbhst) only in three of the six processes (no rebuild needed) -cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ${sa}" +cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -rmbhst ${opts}" $cmd; status=$? ended4="$cmd\nENDED(4) AT $(date) [Status=$status]" -# (72/78) Two extra logs (double/float x hrd0 x inl0 + curhst) only in three of the six processes (no rebuild needed) -cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -curhst ${sa}" +# (72/78) Two extra logs (double/float x hrd0 x inl0 + rndhst) only in three of the six processes (no rebuild needed) +cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt ${rndhst} ${opts}" $cmd; status=$? ended5="$cmd\nENDED(5) AT $(date) [Status=$status]" # (78/78) Two extra logs (double/float x hrd0 x inl0 + common) only in three of the six processes (no rebuild needed) -cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ${sa}" +cmd="./tput/teeThroughputX.sh -eemumu -ggtt -ggttgg -flt -common ${opts}" $cmd; status=$? ended6="$cmd\nENDED(6) AT $(date) [Status=$status]" @@ -92,8 +102,8 @@ echo -e "$ended5" if [ "$ggttggg" == "" ]; then echo echo "To complete the test for ggttggg type:" - echo " ./tput/teeThroughputX.sh -flt -hrd -makej -ggttggg -makeclean ${sa}" - echo " ./tput/teeThroughputX.sh -makej -ggttggg -flt -bridge -makeclean ${sa}" + echo " ./tput/teeThroughputX.sh -flt -hrd -makej -ggttggg -makeclean ${opts}" + echo " ./tput/teeThroughputX.sh -makej -ggttggg -flt -bridge -makeclean ${opts}" fi # Print out any errors in the logs diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index 352d1c6fba..15dbd5f8d1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:27:52 +DATE: 2024-01-30_04:51:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.424219e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.270852e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.136825e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.572573e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.281942e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.116391e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.828494 sec - 2,799,509,879 cycles # 2.917 GHz - 4,362,735,667 instructions # 1.56 insn per cycle - 1.170786520 seconds time elapsed +TOTAL : 0.839714 sec + 2,719,217,340 cycles # 2.832 GHz + 4,277,615,433 instructions # 1.57 insn per cycle + 1.175143775 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.137542e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.334591e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.334591e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.879157e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147243e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.926738 sec - 18,326,467,169 cycles # 3.090 GHz - 43,971,697,990 instructions # 2.40 insn per cycle - 5.940094102 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.790847 sec + 19,539,640,504 cycles # 2.876 GHz + 46,935,351,432 instructions # 2.40 insn per cycle + 6.804517518 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.696452e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.222211e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.222211e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.545376e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.021398e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.021398e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.102636 sec - 12,781,292,854 cycles # 3.112 GHz - 30,998,546,187 instructions # 2.43 insn per cycle - 4.125957125 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.488904 sec + 12,869,370,410 cycles # 2.864 GHz + 31,186,180,279 instructions # 2.42 insn per cycle + 4.505888529 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.105968e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.947022e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.947022e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.955981e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.735873e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.735873e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.375754 sec - 10,081,613,505 cycles # 2.982 GHz - 19,365,068,208 instructions # 1.92 insn per cycle - 3.393379455 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 3.633222 sec + 10,032,348,170 cycles # 2.758 GHz + 19,481,701,848 instructions # 1.94 insn per cycle + 3.651370321 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.197053e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.113568e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.113568e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.070263e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.978600e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.978600e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.253132 sec - 9,700,425,131 cycles # 2.977 GHz - 18,987,900,885 instructions # 1.96 insn per cycle - 3.272883868 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.453661 sec + 9,572,367,477 cycles # 2.767 GHz + 18,943,715,958 instructions # 1.98 insn per cycle + 3.473553059 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.888824e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.524315e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.524315e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.819162e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.469996e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.469996e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.721051 sec - 8,612,012,158 cycles # 2.311 GHz - 15,727,858,115 instructions # 1.83 insn per cycle - 3.740799653 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 3.879359 sec + 8,193,098,191 cycles # 2.110 GHz + 15,513,331,501 instructions # 1.89 insn per cycle + 3.898953032 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index c9fd7402fe..f78ea7251e 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:09:46 +DATE: 2024-01-30_05:45:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.563802e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504997e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504997e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.460171e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.485962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.485962e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.276846 sec - 7,462,184,879 cycles # 2.960 GHz - 13,240,340,961 instructions # 1.77 insn per cycle - 2.578354941 seconds time elapsed +TOTAL : 2.319187 sec + 7,341,770,811 cycles # 2.857 GHz + 13,101,723,847 instructions # 1.78 insn per cycle + 2.628471382 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -81,20 +81,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.060089e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.241158e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.241158e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.576223e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107198e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107198e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.541091 sec - 19,576,478,878 cycles # 2.990 GHz - 44,199,334,845 instructions # 2.26 insn per cycle - 6.547828573 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.190455 sec + 20,703,597,440 cycles # 2.877 GHz + 47,160,901,733 instructions # 2.28 insn per cycle + 7.198222207 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -109,20 +109,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.520389e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.958155e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.958155e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.473769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.897978e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.897978e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.763580 sec - 13,999,932,501 cycles # 2.936 GHz - 31,842,828,089 instructions # 2.27 insn per cycle - 4.771019993 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.898106 sec + 14,084,591,919 cycles # 2.873 GHz + 32,028,151,491 instructions # 2.27 insn per cycle + 4.906157596 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -137,20 +137,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.885630e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.569986e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.569986e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.834615e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.502061e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.502061e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.963354 sec - 11,320,178,526 cycles # 2.852 GHz - 20,728,383,013 instructions # 1.83 insn per cycle - 3.970490035 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 4.065584 sec + 11,264,443,170 cycles # 2.767 GHz + 20,844,723,129 instructions # 1.85 insn per cycle + 4.073296839 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -158,27 +158,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.971664e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.715052e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.715052e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.930005e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.695920e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.695920e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.811146 sec - 10,942,360,524 cycles # 2.867 GHz - 20,336,264,592 instructions # 1.86 insn per cycle - 3.818008523 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.900573 sec + 10,821,072,419 cycles # 2.771 GHz + 20,305,054,668 instructions # 1.88 insn per cycle + 3.908355042 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -186,27 +186,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.699997e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.217210e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.217210e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.707724e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.274502e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.274502e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.325158 sec - 9,912,314,209 cycles # 2.289 GHz - 16,872,526,974 instructions # 1.70 insn per cycle - 4.332090544 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 4.333313 sec + 9,497,951,325 cycles # 2.189 GHz + 16,666,820,850 instructions # 1.75 insn per cycle + 4.341233179 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -214,8 +214,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index 4e10a04e19..f072467bfa 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:23:09 +DATE: 2024-01-30_05:59:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.494137e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.568219e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.090512e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.483909e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.562012e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.071690e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 1.328549 sec - 4,643,802,666 cycles # 2.984 GHz - 7,163,037,263 instructions # 1.54 insn per cycle - 1.615810184 seconds time elapsed +TOTAL : 1.371489 sec + 4,620,404,364 cycles # 2.861 GHz + 7,153,271,516 instructions # 1.55 insn per cycle + 1.672602435 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.098849e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.289504e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.289504e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.952512e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155636e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155636e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 6.492604 sec - 19,420,681,078 cycles # 2.994 GHz - 44,081,202,964 instructions # 2.27 insn per cycle - 6.497991580 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.107803 sec + 20,592,800,911 cycles # 2.895 GHz + 47,037,031,319 instructions # 2.28 insn per cycle + 7.114495241 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.642683e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.149460e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.149460e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.558277e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.038534e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.038534e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.592487 sec - 13,879,570,492 cycles # 3.021 GHz - 31,002,445,042 instructions # 2.23 insn per cycle - 4.597888713 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.822482 sec + 13,870,774,877 cycles # 2.874 GHz + 31,186,249,487 instructions # 2.25 insn per cycle + 4.828845646 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.041509e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.847095e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.847095e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.951724e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.730389e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.730389e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.835804 sec - 11,175,680,204 cycles # 2.910 GHz - 19,267,089,581 instructions # 1.72 insn per cycle - 3.841361823 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 4.015384 sec + 11,119,337,735 cycles # 2.766 GHz + 19,381,852,554 instructions # 1.74 insn per cycle + 4.022009475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.097943e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.989766e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.989766e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.063314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.951443e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951443e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.768859 sec - 10,910,527,024 cycles # 2.891 GHz - 18,677,308,293 instructions # 1.71 insn per cycle - 3.774878679 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.845408 sec + 10,662,597,452 cycles # 2.769 GHz + 18,643,141,459 instructions # 1.75 insn per cycle + 3.852109381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.768127e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.357872e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.357872e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.811483e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.460421e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.460421e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.339198 sec - 9,800,415,487 cycles # 2.256 GHz - 15,427,741,546 instructions # 1.57 insn per cycle - 4.345173089 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 4.272939 sec + 9,279,488,955 cycles # 2.169 GHz + 15,212,537,826 instructions # 1.64 insn per cycle + 4.279485071 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 6f403f72b4..a6db5de426 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:19:52 +DATE: 2024-01-30_05:55:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.511914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.599963e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.148106e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.492089e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.565509e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.085712e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.984391 sec - 3,583,458,511 cycles # 2.950 GHz - 7,069,877,265 instructions # 1.97 insn per cycle - 1.271392022 seconds time elapsed +TOTAL : 0.999229 sec + 3,503,665,967 cycles # 2.851 GHz + 7,040,796,455 instructions # 2.01 insn per cycle + 1.289089254 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.096082e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.286319e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.286319e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.897604e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152411e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152411e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.151369 sec - 18,323,138,260 cycles # 2.977 GHz - 43,972,152,337 instructions # 2.40 insn per cycle - 6.156982875 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.777699 sec + 19,525,012,140 cycles # 2.879 GHz + 46,935,602,227 instructions # 2.40 insn per cycle + 6.784496054 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.603629e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.100551e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.100551e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.565929e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.046315e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.046315e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.336645 sec - 12,767,737,304 cycles # 2.941 GHz - 30,998,256,840 instructions # 2.43 insn per cycle - 4.342127866 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.431371 sec + 12,844,580,525 cycles # 2.895 GHz + 31,183,505,413 instructions # 2.43 insn per cycle + 4.438022505 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.019068e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.818538e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.818538e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.956069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.738681e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.738681e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.519784 sec - 10,072,637,753 cycles # 2.858 GHz - 19,364,816,632 instructions # 1.92 insn per cycle - 3.525396454 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 3.632695 sec + 10,040,197,478 cycles # 2.761 GHz + 19,480,754,402 instructions # 1.94 insn per cycle + 3.639336589 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.119317e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.001533e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.001533e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.068909e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.973543e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.973543e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.367447 sec - 9,690,886,611 cycles # 2.874 GHz - 18,986,470,372 instructions # 1.96 insn per cycle - 3.372983732 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.456026 sec + 9,583,252,780 cycles # 2.770 GHz + 18,943,299,087 instructions # 1.98 insn per cycle + 3.462550493 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.792369e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.378971e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.378971e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.820163e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.473451e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.473451e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.914886 sec - 8,594,430,788 cycles # 2.193 GHz - 15,726,409,723 instructions # 1.83 insn per cycle - 3.920405262 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 3.874228 sec + 8,184,248,497 cycles # 2.110 GHz + 15,512,168,002 instructions # 1.90 insn per cycle + 3.880483923 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index d1ecb99501..4dded3e862 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:16:32 +DATE: 2024-01-30_05:52:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.945904e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.504449e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.028089e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.831383e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.529080e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.990768e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 1.911118 sec - 6,389,730,073 cycles # 2.981 GHz - 11,525,444,460 instructions # 1.80 insn per cycle - 2.200099911 seconds time elapsed +TOTAL : 1.936415 sec + 6,196,996,673 cycles # 2.858 GHz + 11,355,646,527 instructions # 1.83 insn per cycle + 2.226164304 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -74,20 +74,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.103959e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.297288e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.297288e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.923680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.152570e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.152570e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.107673 sec - 18,355,628,587 cycles # 3.004 GHz - 43,972,892,726 instructions # 2.40 insn per cycle - 6.113444450 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.755774 sec + 19,508,468,124 cycles # 2.886 GHz + 46,934,079,079 instructions # 2.41 insn per cycle + 6.762162730 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 472) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -101,20 +101,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.634128e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.138204e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.138204e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.560350e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.041132e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041132e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.254868 sec - 12,786,896,134 cycles # 3.002 GHz - 30,998,062,799 instructions # 2.42 insn per cycle - 4.260594817 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.445978 sec + 12,824,682,223 cycles # 2.881 GHz + 31,183,984,467 instructions # 2.43 insn per cycle + 4.452647644 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1626) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -128,20 +128,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.027528e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.835087e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.835087e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.945035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.719021e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.719021e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.504925 sec - 10,096,961,114 cycles # 2.877 GHz - 19,366,201,509 instructions # 1.92 insn per cycle - 3.510626038 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) +TOTAL : 3.651562 sec + 10,054,417,482 cycles # 2.750 GHz + 19,480,651,159 instructions # 1.94 insn per cycle + 3.658175830 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1964) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -149,26 +149,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.053655e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.901972e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.901972e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.065244e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.964476e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.964476e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.470906 sec - 9,748,601,702 cycles # 2.805 GHz - 18,987,159,627 instructions # 1.95 insn per cycle - 3.476589218 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) +TOTAL : 3.463334 sec + 9,575,609,591 cycles # 2.761 GHz + 18,944,249,093 instructions # 1.98 insn per cycle + 3.469928809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1655) (512y: 161) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,26 +176,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.791558e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.378176e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.378176e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.819790e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.476564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.476564e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.917961 sec - 8,615,779,869 cycles # 2.196 GHz - 15,727,012,410 instructions # 1.83 insn per cycle - 3.924530110 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) +TOTAL : 3.875473 sec + 8,194,000,405 cycles # 2.112 GHz + 15,512,267,676 instructions # 1.89 insn per cycle + 3.882168596 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 920) (512y: 59) (512z: 1220) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -203,8 +203,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index 0c748b5362..9238de7bbb 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:28:26 +DATE: 2024-01-30_04:52:22 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.428371e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.291622e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.195095e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.433269e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.304294e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.211626e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.698284 sec - 2,831,920,965 cycles # 3.030 GHz - 4,374,282,745 instructions # 1.54 insn per cycle - 1.019007238 seconds time elapsed +TOTAL : 0.708580 sec + 2,678,035,833 cycles # 2.828 GHz + 4,219,258,618 instructions # 1.58 insn per cycle + 1.025396427 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.203126e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.425801e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.425801e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.057712e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.240764e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.240764e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.626130 sec - 17,496,349,043 cycles # 3.107 GHz - 41,817,978,562 instructions # 2.39 insn per cycle - 5.639060633 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.363915 sec + 18,420,155,453 cycles # 2.892 GHz + 44,716,833,361 instructions # 2.43 insn per cycle + 6.376789264 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 486) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.748150e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.309667e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.309667e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.624136e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.147437e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.147437e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.992952 sec - 12,464,968,903 cycles # 3.118 GHz - 30,161,347,268 instructions # 2.42 insn per cycle - 4.011343884 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.286124 sec + 12,429,118,549 cycles # 2.897 GHz + 30,107,231,858 instructions # 2.42 insn per cycle + 4.302706533 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1569) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.129918e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.998050e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.998050e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.942189e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.705004e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.705004e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.345656 sec - 9,985,760,194 cycles # 2.981 GHz - 19,098,128,141 instructions # 1.91 insn per cycle - 3.367255257 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) +TOTAL : 3.656079 sec + 10,127,428,804 cycles # 2.766 GHz + 19,115,519,637 instructions # 1.89 insn per cycle + 3.673885868 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1902) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.214083e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.146132e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.146132e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.094903e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.039710e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.039710e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.227419 sec - 9,634,388,874 cycles # 2.980 GHz - 18,745,812,999 instructions # 1.95 insn per cycle - 3.242688774 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) +TOTAL : 3.417483 sec + 9,477,381,758 cycles # 2.768 GHz + 18,489,351,216 instructions # 1.95 insn per cycle + 3.434681568 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1576) (512y: 159) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.942471e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.616599e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.616599e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.183418e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.193735e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.193735e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.630500 sec - 8,446,217,730 cycles # 2.323 GHz - 15,603,353,222 instructions # 1.85 insn per cycle - 3.648277451 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) +TOTAL : 3.298580 sec + 7,210,521,695 cycles # 2.182 GHz + 13,864,693,183 instructions # 1.92 insn per cycle + 3.315590461 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 818) (512y: 57) (512z: 898) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index 03ec5636b7..09e3552971 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:58:42 +DATE: 2024-01-30_05:33:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.462140e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.622718e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.171485e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.454720e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.590982e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.126095e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.680938 sec - 2,683,046,213 cycles # 2.937 GHz - 4,147,768,853 instructions # 1.55 insn per cycle - 0.974174885 seconds time elapsed +TOTAL : 0.682889 sec + 2,611,559,388 cycles # 2.831 GHz + 3,986,840,129 instructions # 1.53 insn per cycle + 0.986209294 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.640951e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.107305e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.107305e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.350945e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.669369e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.669369e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 4.242929 sec - 12,671,992,752 cycles # 2.983 GHz - 32,514,493,053 instructions # 2.57 insn per cycle - 4.248758803 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.075739 sec + 14,632,134,397 cycles # 2.880 GHz + 36,697,212,873 instructions # 2.51 insn per cycle + 5.082665504 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 707) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.096760e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.006731e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.006731e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.975416e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.812212e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.812212e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.405789 sec - 10,268,358,622 cycles # 3.010 GHz - 24,473,723,768 instructions # 2.38 insn per cycle - 3.411753584 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.599579 sec + 10,391,716,980 cycles # 2.883 GHz + 24,753,509,930 instructions # 2.38 insn per cycle + 3.606361950 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.257873e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.333676e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.333676e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.206864e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.274609e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.274609e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.192822 sec - 9,136,252,150 cycles # 2.858 GHz - 16,922,200,539 instructions # 1.85 insn per cycle - 3.198278524 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) +TOTAL : 3.273737 sec + 8,884,033,270 cycles # 2.722 GHz + 16,960,441,009 instructions # 1.91 insn per cycle + 3.280558312 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.262662e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.358756e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.358756e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.436675e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.780065e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.780065e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.189189 sec - 8,979,407,762 cycles # 2.811 GHz - 16,334,341,152 instructions # 1.82 insn per cycle - 3.195303169 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) +TOTAL : 2.997375 sec + 8,315,936,313 cycles # 2.769 GHz + 16,298,181,743 instructions # 1.96 insn per cycle + 3.004046425 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2403) (512y: 292) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.031223e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.819033e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.819033e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.987391e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.794180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.794180e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.500399 sec - 7,927,835,353 cycles # 2.263 GHz - 14,582,334,434 instructions # 1.84 insn per cycle - 3.505946160 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) +TOTAL : 3.583817 sec + 7,670,874,044 cycles # 2.137 GHz + 14,352,448,248 instructions # 1.87 insn per cycle + 3.590538974 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 892) (512y: 63) (512z: 975) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039868165088E-002 -Relative difference = 1.0277089312025782e-08 +Avg ME (F77/C++) = 1.2828039868165090E-002 +Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index ed59ed96d8..508008a0c5 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:59:12 +DATE: 2024-01-30_05:34:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.466236e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.600756e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.174175e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.464301e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.594213e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.177261e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.677095 sec - 2,664,495,162 cycles # 2.925 GHz - 4,015,268,500 instructions # 1.51 insn per cycle - 0.968065572 seconds time elapsed +TOTAL : 0.680513 sec + 2,594,214,158 cycles # 2.833 GHz + 3,992,420,158 instructions # 1.54 insn per cycle + 0.978034885 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.188114e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.096436e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.096436e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.895468e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.581482e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.581482e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.282515 sec - 9,850,281,434 cycles # 2.996 GHz - 25,394,191,492 instructions # 2.58 insn per cycle - 3.288415370 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.738704 sec + 10,794,188,443 cycles # 2.885 GHz + 28,356,720,092 instructions # 2.63 insn per cycle + 3.745371478 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.427078e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.740655e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.740655e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.231818e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.360148e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.360148e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.000969 sec - 8,962,872,354 cycles # 2.983 GHz - 21,484,742,690 instructions # 2.40 insn per cycle - 3.006873984 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.232648 sec + 9,331,358,518 cycles # 2.882 GHz + 21,587,159,141 instructions # 2.31 insn per cycle + 3.239331570 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2117) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.437736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.701040e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.701040e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.406271e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.696326e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.696326e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.982872 sec - 8,650,300,725 cycles # 2.896 GHz - 15,811,446,227 instructions # 1.83 insn per cycle - 2.988774005 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) +TOTAL : 3.030114 sec + 8,381,289,955 cycles # 2.761 GHz + 15,943,872,727 instructions # 1.90 insn per cycle + 3.036686774 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.506471e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.822748e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822748e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.611770e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.211566e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.211566e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 2.907315 sec - 8,443,660,993 cycles # 2.899 GHz - 15,514,495,232 instructions # 1.84 insn per cycle - 2.913215721 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) +TOTAL : 2.823652 sec + 7,834,743,570 cycles # 2.770 GHz + 15,370,444,400 instructions # 1.96 insn per cycle + 2.830226684 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2179) (512y: 307) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.094032e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.959241e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.959241e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.110110e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.044152e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.044152e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.415670 sec - 7,621,535,351 cycles # 2.228 GHz - 14,285,430,897 instructions # 1.87 insn per cycle - 3.421826468 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) +TOTAL : 3.399029 sec + 7,342,854,469 cycles # 2.157 GHz + 13,880,932,107 instructions # 1.89 insn per cycle + 3.405583219 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 853) (512y: 69) (512z: 905) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 5eeb6e403e..30054d0a8f 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:28:58 +DATE: 2024-01-30_04:52:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.089717e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.081644e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.286206e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.089125e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.083340e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.291553e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.586552 sec - 2,420,869,304 cycles # 2.991 GHz - 3,828,073,747 instructions # 1.58 insn per cycle - 0.888297566 seconds time elapsed +TOTAL : 0.592260 sec + 2,336,196,912 cycles # 2.833 GHz + 3,633,132,034 instructions # 1.56 insn per cycle + 0.902800684 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.174500e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.392497e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.392497e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.035118e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.220346e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.220346e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.713477 sec - 17,800,790,292 cycles # 3.113 GHz - 43,512,722,868 instructions # 2.44 insn per cycle - 5.725545534 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.458158 sec + 18,623,778,658 cycles # 2.882 GHz + 47,047,597,520 instructions # 2.53 insn per cycle + 6.468376899 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.407559e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.694429e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.694429e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.220597e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.402817e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.402817e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.969731 sec - 9,247,230,421 cycles # 3.108 GHz - 21,907,456,003 instructions # 2.37 insn per cycle - 2.986090115 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.207438 sec + 9,259,856,985 cycles # 2.882 GHz + 22,093,069,841 instructions # 2.39 insn per cycle + 3.223491423 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.603940e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.990614e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.990614e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.440699e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.781387e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.781387e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.767987 sec - 8,299,418,318 cycles # 2.993 GHz - 15,591,106,391 instructions # 1.88 insn per cycle - 2.786468252 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 2.957121 sec + 8,193,990,799 cycles # 2.766 GHz + 15,625,791,555 instructions # 1.91 insn per cycle + 2.973833384 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.627112e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.068065e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.068065e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.532783e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.026282e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.026282e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.741024 sec - 8,236,041,235 cycles # 2.999 GHz - 15,428,889,211 instructions # 1.87 insn per cycle - 2.759001089 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 2.864857 sec + 7,877,312,491 cycles # 2.746 GHz + 15,298,553,606 instructions # 1.94 insn per cycle + 2.880238416 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.613778e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.027225e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.027225e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.515538e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.925634e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.925634e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.766843 sec - 6,634,803,505 cycles # 2.394 GHz - 12,864,482,335 instructions # 1.94 insn per cycle - 2.786287293 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 2.878384 sec + 6,411,016,127 cycles # 2.223 GHz + 12,624,518,195 instructions # 1.97 insn per cycle + 2.897065980 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 77f863e39f..cb0960cef7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:10:24 +DATE: 2024-01-30_05:46:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! 
Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.062157e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.459463e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.459463e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.896245e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.389243e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.389243e+07 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.708612 sec - 5,739,082,619 cycles # 2.966 GHz - 10,314,110,992 instructions # 1.80 insn per cycle - 1.994013421 seconds time elapsed +TOTAL : 1.734031 sec + 5,668,072,364 cycles # 2.868 GHz + 10,146,395,921 instructions # 1.79 insn per cycle + 2.033339529 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -81,20 +81,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.106417e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.307176e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.307176e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.023723e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.199962e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.199962e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 6.159436 sec - 18,484,427,111 cycles # 2.999 GHz - 43,663,224,156 instructions # 2.36 insn per cycle - 6.165948155 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.631109 sec + 19,198,970,802 cycles # 2.893 GHz + 47,195,604,267 instructions # 2.46 insn per cycle + 6.638520301 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,27 +102,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.217277e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320268e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320268e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.130711e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.183569e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.183569e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.327491 sec - 9,999,841,208 cycles # 3.000 GHz - 23,242,551,966 instructions # 2.32 insn per cycle - 3.334092044 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.452422 sec + 9,989,387,225 cycles # 2.889 GHz + 23,431,077,272 instructions # 2.35 insn per cycle + 3.459894158 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -137,20 +137,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.386544e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.570204e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.570204e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.341081e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.547294e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.547294e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.123723 sec - 9,026,914,064 cycles # 2.885 GHz - 16,711,011,073 instructions # 1.85 insn per cycle - 3.130206343 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 3.196012 sec + 8,906,176,925 cycles # 2.782 GHz + 16,751,991,837 instructions # 1.88 insn per cycle + 3.203321936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -165,20 +165,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.374379e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.579230e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.579230e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.434021e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.786427e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.786427e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 3.141266 sec - 8,951,033,856 cycles # 2.844 GHz - 16,554,473,732 instructions # 1.85 insn per cycle - 3.147725016 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 3.093664 sec + 8,635,370,178 cycles # 2.786 GHz + 16,424,138,356 instructions # 1.90 insn per cycle + 3.101132741 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -193,20 +193,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.422417e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.623038e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.623038e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.383314e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.611676e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.611676e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 3.084070 sec - 7,371,897,160 cycles # 2.386 GHz - 14,070,343,022 instructions # 1.91 insn per cycle - 3.090398779 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 3.145258 sec + 7,151,980,153 cycles # 2.270 GHz + 13,850,467,115 instructions # 1.94 insn per cycle + 3.152590479 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -214,8 +214,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 94aeb34d20..26c818590d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:23:46 +DATE: 2024-01-30_05:59:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.278888e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156496e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.253231e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.303596e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.175288e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.243996e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 -TOTAL : 1.221248 sec - 4,164,953,848 cycles # 2.869 GHz - 6,574,190,110 instructions # 1.58 insn per cycle - 1.509194129 seconds time elapsed +TOTAL : 1.207260 sec + 4,082,591,214 cycles # 2.858 GHz + 6,515,356,659 instructions # 1.60 insn per cycle + 1.486873600 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.130054e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.340325e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.340325e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.222240e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.222240e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 6.276129 sec - 18,853,916,186 cycles # 3.003 GHz - 43,696,977,700 instructions # 2.32 insn per cycle - 6.281870449 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.777770 sec + 19,569,392,860 cycles # 2.885 GHz + 47,229,099,277 instructions # 2.41 insn per cycle + 6.784024049 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.294770e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.514019e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.514019e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.224011e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.394362e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.394362e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 3.444527 sec - 10,256,624,459 cycles # 2.974 GHz - 21,988,381,288 instructions # 2.14 insn per cycle - 3.450359347 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.543713 sec + 10,250,573,649 cycles # 2.890 GHz + 22,173,775,935 instructions # 2.16 insn per cycle + 3.550219999 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.493233e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.822049e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822049e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.458663e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.813529e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.813529e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.224970 sec - 9,324,398,209 cycles # 2.887 GHz - 15,502,220,245 instructions # 1.66 insn per cycle - 3.230595711 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 3.280080 sec + 9,161,776,432 cycles # 2.789 GHz + 15,536,168,479 instructions # 1.70 insn per cycle + 3.286291256 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.511667e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.890072e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.890072e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.554649e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.077981e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.077981e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.207482 sec - 9,275,437,455 cycles # 2.888 GHz - 15,144,147,371 instructions # 1.63 insn per cycle - 3.213077967 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 3.189150 sec + 8,891,496,493 cycles # 2.784 GHz + 15,006,164,122 instructions # 1.69 insn per cycle + 3.195486341 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.556471e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.938543e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.938543e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.516232e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.934012e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.934012e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 3.164113 sec - 7,669,667,119 cycles # 2.420 GHz - 12,572,972,519 instructions # 1.64 insn per cycle - 3.169922598 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 3.229540 sec + 7,432,998,054 cycles # 2.298 GHz + 12,333,053,960 instructions # 1.66 insn per cycle + 3.235962697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index 62bb3359c2..90d7f62db4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:20:27 +DATE: 2024-01-30_05:56:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.313617e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.192806e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.298454e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.305141e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181296e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.274552e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.858273 sec - 3,147,417,725 cycles # 2.925 GHz - 6,398,055,131 instructions # 2.03 insn per cycle - 1.135101998 seconds time elapsed +TOTAL : 0.867173 sec + 3,085,877,327 cycles # 2.830 GHz + 6,333,420,740 instructions # 2.05 insn per cycle + 1.147827940 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.125924e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.336940e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.336940e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.039832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.222763e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.222763e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.957055 sec - 17,830,147,444 cycles # 2.991 GHz - 43,513,458,674 instructions # 2.44 insn per cycle - 5.962276638 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.429819 sec + 18,561,263,651 cycles # 2.885 GHz + 47,048,334,209 instructions # 2.53 insn per cycle + 6.436326918 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.326147e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.566110e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.566110e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.222730e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.393980e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.393980e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.071304 sec - 9,223,399,519 cycles # 2.999 GHz - 21,906,628,505 instructions # 2.38 insn per cycle - 3.076527254 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.203768 sec + 9,238,443,218 cycles # 2.879 GHz + 22,092,244,938 instructions # 2.39 insn per cycle + 3.210105048 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.495233e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.826120e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.826120e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.418509e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.733909e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.733909e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.884227 sec - 8,304,746,805 cycles # 2.875 GHz - 15,590,862,530 instructions # 1.88 insn per cycle - 2.889527315 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 2.982066 sec + 8,185,679,734 cycles # 2.740 GHz + 15,625,107,028 instructions # 1.91 insn per cycle + 2.988278371 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.543511e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.915375e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.915375e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.558846e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.085053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.085053e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.832113 sec - 8,194,558,683 cycles # 2.889 GHz - 15,428,318,410 instructions # 1.88 insn per cycle - 2.837422327 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 2.831486 sec + 7,894,514,850 cycles # 2.783 GHz + 15,296,644,493 instructions # 1.94 insn per cycle + 2.837958999 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.541346e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.894800e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.894800e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.525394e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.942507e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.942507e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.839095 sec - 6,610,683,902 cycles # 2.326 GHz - 12,864,156,943 instructions # 1.95 insn per cycle - 2.844410626 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 2.867904 sec + 6,407,267,092 cycles # 2.230 GHz + 12,623,570,741 instructions # 1.97 insn per cycle + 2.874115235 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 2afd300dfd..91671fa84d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:17:07 +DATE: 2024-01-30_05:53:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.991991e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.144574e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.162617e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.674927e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142204e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.126513e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 -TOTAL : 1.488342 sec - 5,031,725,760 cycles # 2.945 GHz - 9,217,170,327 instructions # 1.83 insn per cycle - 1.765546553 seconds time elapsed +TOTAL : 1.522177 sec + 5,014,296,377 cycles # 2.858 GHz + 9,135,258,914 instructions # 1.82 insn per cycle + 1.813578794 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -74,20 +74,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.131966e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.341693e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.341693e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.043183e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.226572e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.226572e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.927034 sec - 17,813,349,259 cycles # 3.004 GHz - 43,513,436,757 instructions # 2.44 insn per cycle - 5.932279783 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.408092 sec + 18,567,709,150 cycles # 2.896 GHz + 47,047,255,730 instructions # 2.53 insn per cycle + 6.414419955 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 542) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -95,26 +95,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.333420e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.574664e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.574664e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.231919e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.414648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.414648e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.064718 sec - 9,236,786,325 cycles # 3.009 GHz - 21,906,852,468 instructions # 2.37 insn per cycle - 3.070020479 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.191678 sec + 9,246,166,536 cycles # 2.894 GHz + 22,093,449,321 instructions # 2.39 insn per cycle + 3.197919261 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1883) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -128,20 +128,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.506026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.849776e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.849776e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.455778e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.806689e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.806689e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.874113 sec - 8,320,499,209 cycles # 2.891 GHz - 15,592,249,654 instructions # 1.87 insn per cycle - 2.879517495 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) +TOTAL : 2.938294 sec + 8,179,243,825 cycles # 2.779 GHz + 15,624,915,954 instructions # 1.91 insn per cycle + 2.944456642 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2619) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -155,20 +155,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.509013e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.864492e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.864492e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.562111e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.082808e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.082808e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.870737 sec - 8,241,489,655 cycles # 2.866 GHz - 15,430,280,733 instructions # 1.87 insn per cycle - 2.876296254 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) +TOTAL : 2.828979 sec + 7,880,998,863 cycles # 2.781 GHz + 15,296,291,599 instructions # 1.94 insn per cycle + 2.835269816 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -182,20 +182,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.501831e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.838293e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.838293e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.528595e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.951135e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.951135e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.887603 sec - 6,626,308,640 cycles # 2.293 GHz - 12,864,606,095 instructions # 1.94 insn per cycle - 2.893234728 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) +TOTAL : 2.864434 sec + 6,402,503,393 cycles # 2.232 GHz + 12,623,594,501 instructions # 1.97 insn per cycle + 2.870718249 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1615) (512y: 12) (512z: 1404) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -203,8 +203,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052585973637E-002 -Relative difference = 2.0158743040564767e-07 +Avg ME (F77/C++) = 1.2828052589611616E-002 +Relative difference = 2.0187102602673518e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index efc03ad0b2..cc5700bb60 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:29:28 +DATE: 2024-01-30_04:53:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.089951e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092950e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.329798e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.091291e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.093645e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.338052e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.581630 sec - 2,429,919,567 cycles # 3.010 GHz - 3,742,142,294 instructions # 1.54 insn per cycle - 0.889170895 seconds time elapsed +TOTAL : 0.585723 sec + 2,310,991,948 cycles # 2.835 GHz + 3,567,792,024 instructions # 1.54 insn per cycle + 0.889438316 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.231336e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.478764e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.478764e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.092050e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.295990e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.295990e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 5.469982 sec - 16,730,831,486 cycles # 3.056 GHz - 41,270,909,044 instructions # 2.47 insn per cycle - 5.482091928 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.138113 sec + 17,749,278,373 cycles # 2.890 GHz + 43,890,075,557 instructions # 2.47 insn per cycle + 6.149965364 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 467) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039854866802E-002 -Relative difference = 1.1313746984080878e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.472788e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.845298e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.845298e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.281832e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.528866e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.528866e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.902455 sec - 8,999,277,217 cycles # 3.096 GHz - 21,212,156,929 instructions # 2.36 insn per cycle - 2.920254424 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.131918 sec + 9,063,997,030 cycles # 2.890 GHz + 21,583,444,087 instructions # 2.38 insn per cycle + 3.172631085 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.606896e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.017656e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.017656e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.471429e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.850830e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.850830e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.765803 sec - 8,250,996,001 cycles # 2.978 GHz - 15,426,023,835 instructions # 1.87 insn per cycle - 2.784276443 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) +TOTAL : 2.922404 sec + 8,130,490,307 cycles # 2.776 GHz + 15,429,884,484 instructions # 1.90 insn per cycle + 2.941222784 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2542) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.659785e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.147153e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.147153e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.565898e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093653e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093653e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.718129 sec - 8,116,789,845 cycles # 2.981 GHz - 15,238,834,705 instructions # 1.88 insn per cycle - 2.738443955 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) +TOTAL : 2.826189 sec + 7,861,694,964 cycles # 2.776 GHz + 15,087,354,653 instructions # 1.92 insn per cycle + 2.844638276 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2323) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.665676e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.125551e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.125551e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.637184e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.244046e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.244046e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.716332 sec - 6,606,196,349 cycles # 2.427 GHz - 12,842,263,926 instructions # 1.94 insn per cycle - 2.735947842 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) +TOTAL : 2.766988 sec + 6,178,543,208 cycles # 2.228 GHz + 12,245,131,195 instructions # 1.98 insn per cycle + 2.787936795 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1538) (512y: 8) (512z: 1258) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052564145764E-002 -Relative difference = 1.9988585667912256e-07 +Avg ME (F77/C++) = 1.2828052431359538E-002 +Relative difference = 1.895346165094282e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 6aac347ebc..df038945e7 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:59:40 +DATE: 2024-01-30_05:34:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.268443e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.159820e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.242861e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.293279e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.189438e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.292426e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.585151 sec - 2,376,619,364 cycles # 2.930 GHz - 3,674,969,576 instructions # 1.55 insn per cycle - 0.869620998 seconds time elapsed +TOTAL : 0.574531 sec + 2,278,103,742 cycles # 2.838 GHz + 3,559,192,155 instructions # 1.56 insn per cycle + 0.862169679 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.681014e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.190697e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.190697e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.401205e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.755017e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.755017e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 4.111208 sec - 12,184,760,501 cycles # 2.960 GHz - 32,432,932,194 instructions # 2.66 insn per cycle - 4.116807187 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.863284 sec + 13,757,936,316 cycles # 2.826 GHz + 37,850,126,745 instructions # 2.75 insn per cycle + 4.870249581 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 833) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039840314887E-002 -Relative difference = 1.244813035273009e-08 +Avg ME (F77/C++) = 1.2828039414671366E-002 +Relative difference = 4.562884388571957e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.746012e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.644193e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.644193e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.651233e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.514070e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.514070e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.661942 sec - 8,002,880,266 cycles # 3.001 GHz - 18,657,307,287 instructions # 2.33 insn per cycle - 2.667590652 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.748031 sec + 7,929,384,882 cycles # 2.881 GHz + 18,604,713,730 instructions # 2.35 insn per cycle + 2.754502860 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2808) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039283704129E-002 -Relative difference = 5.583829420356249e-08 +Avg ME (F77/C++) = 1.2828039280066150E-002 +Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.839072e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.690688e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.690688e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.730630e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.541231e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.541231e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.575310 sec - 7,484,630,621 cycles # 2.901 GHz - 14,251,612,805 instructions # 1.90 insn per cycle - 2.581216132 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) +TOTAL : 2.679636 sec + 7,420,774,430 cycles # 2.764 GHz + 14,339,383,869 instructions # 1.93 insn per cycle + 2.686088553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2251) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 +Avg ME (F77/C++) = 1.2828053246266791E-002 +Relative difference = 2.5306003563303186e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.852469e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.787188e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.787188e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.796396e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.739468e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.739468e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.574795 sec - 7,331,036,803 cycles # 2.845 GHz - 13,950,107,124 instructions # 1.90 insn per cycle - 2.580411131 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) +TOTAL : 2.625810 sec + 7,304,334,176 cycles # 2.778 GHz + 13,955,275,285 instructions # 1.91 insn per cycle + 2.632447793 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3875) (512y: 9) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053244447801E-002 -Relative difference = 2.5291823782248813e-07 +Avg ME (F77/C++) = 1.2828053277189611E-002 +Relative difference = 2.5547059841227576e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.587347e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.044303e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.044303e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.601296e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.146430e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.146430e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.795609 sec - 6,537,128,178 cycles # 2.335 GHz - 13,422,893,916 instructions # 2.05 insn per cycle - 2.801293417 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) +TOTAL : 2.796781 sec + 6,273,154,150 cycles # 2.239 GHz + 13,210,323,797 instructions # 2.11 insn per cycle + 2.803318258 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1734) (512y: 3) (512z: 1266) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052562326775E-002 -Relative difference = 1.997440588685788e-07 +Avg ME (F77/C++) = 1.2828052540498902E-002 +Relative difference = 1.980424851420537e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index 320612f062..784101060d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_15:00:08 +DATE: 2024-01-30_05:35:26 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.301880e+08 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.200050e+09 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.327102e+09 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.300997e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.192378e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.323768e+09 ) sec^-1 MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 -TOTAL : 0.577502 sec - 2,367,605,467 cycles # 2.931 GHz - 3,686,475,541 instructions # 1.56 insn per cycle - 0.865341028 seconds time elapsed +TOTAL : 0.574497 sec + 2,274,789,999 cycles # 2.831 GHz + 3,565,149,005 instructions # 1.57 insn per cycle + 0.863293975 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.231230e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.232983e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.232983e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.974769e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.758467e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.758467e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 3.189007 sec - 9,438,833,361 cycles # 2.955 GHz - 25,269,338,727 instructions # 2.68 insn per cycle - 3.194786598 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.560083 sec + 10,128,258,424 cycles # 2.841 GHz + 28,399,859,483 instructions # 2.80 insn per cycle + 3.566485849 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 632) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 -Avg ME (F77/C++) = 1.2828039838495897E-002 -Relative difference = 1.2589928273811243e-08 +Avg ME (F77/C++) = 1.2828039441956207E-002 +Relative difference = 4.35018750695023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.111513e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.795769e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.795769e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.921662e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.360866e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.360866e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 -TOTAL : 2.388364 sec - 7,189,276,734 cycles # 3.004 GHz - 16,868,514,931 instructions # 2.35 insn per cycle - 2.393974942 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.529327 sec + 7,292,501,410 cycles # 2.880 GHz + 16,787,289,445 instructions # 2.30 insn per cycle + 2.535811154 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2463) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.018803e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.200647e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.200647e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.902980e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.008268e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.008268e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.444205 sec - 7,132,947,258 cycles # 2.912 GHz - 13,616,405,803 instructions # 1.91 insn per cycle - 2.449801989 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) +TOTAL : 2.546290 sec + 7,099,294,688 cycles # 2.783 GHz + 13,729,465,706 instructions # 1.93 insn per cycle + 2.552602290 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2082) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053220800939E-002 -Relative difference = 2.5107486628541925e-07 +Avg ME (F77/C++) = 1.2828053198973066E-002 +Relative difference = 2.4937329255889414e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.049548e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.286960e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.286960e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.894124e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.023412e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.023412e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 -TOTAL : 2.421039 sec - 7,049,543,144 cycles # 2.906 GHz - 13,426,454,258 instructions # 1.90 insn per cycle - 2.426713774 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) +TOTAL : 2.549860 sec + 7,037,352,059 cycles # 2.755 GHz + 13,462,222,302 instructions # 1.91 insn per cycle + 2.556338558 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3649) (512y: 12) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828053220800939E-002 -Relative difference = 2.5107486628541925e-07 +Avg ME (F77/C++) = 1.2828053198973066E-002 +Relative difference = 2.4937329255889414e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.688665e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.273438e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.273438e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.741921e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.505340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.505340e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 -TOTAL : 2.707645 sec - 6,353,817,019 cycles # 2.343 GHz - 13,154,875,496 instructions # 2.07 insn per cycle - 2.713469152 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) +TOTAL : 2.671598 sec + 6,046,764,080 cycles # 2.259 GHz + 12,911,501,907 instructions # 2.14 insn per cycle + 2.677952936 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1671) (512y: 3) (512z: 1155) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 -Avg ME (F77/C++) = 1.2828052536860923E-002 -Relative difference = 1.977588895209662e-07 +Avg ME (F77/C++) = 1.2828052431359538E-002 +Relative difference = 1.895346165094282e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 675f8002f0..7a09642823 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:29:57 +DATE: 2024-01-30_04:54:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.421403e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.263390e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.120736e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.434258e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.281519e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.171049e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.699628 sec - 2,791,107,487 cycles # 2.987 GHz - 4,371,723,888 instructions # 1.57 insn per cycle - 1.020217670 seconds time elapsed +TOTAL : 0.704261 sec + 2,701,570,097 cycles # 2.831 GHz + 4,244,340,283 instructions # 1.57 insn per cycle + 1.033944641 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.110744e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.296406e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.296406e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.829628e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.139787e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.139787e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 6.059605 sec - 18,785,537,380 cycles # 3.098 GHz - 44,223,451,515 instructions # 2.35 insn per cycle - 6.072025963 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.819159 sec + 19,690,827,956 cycles # 2.885 GHz + 46,971,779,576 instructions # 2.39 insn per cycle + 6.832663552 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.746395e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.315536e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.315536e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.605344e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.116934e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.116934e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.998774 sec - 12,365,852,528 cycles # 3.089 GHz - 30,918,818,290 instructions # 2.50 insn per cycle - 4.019870048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.334479 sec + 12,518,471,325 cycles # 2.884 GHz + 30,922,888,427 instructions # 2.47 insn per cycle + 4.354467708 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1667) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.045102e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.850310e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.850310e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.917239e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.660472e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660472e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.473353 sec - 10,119,124,866 cycles # 2.909 GHz - 19,374,205,910 instructions # 1.91 insn per cycle - 3.489755488 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) +TOTAL : 3.702387 sec + 10,174,876,030 cycles # 2.745 GHz + 19,548,406,942 instructions # 1.92 insn per cycle + 3.720275920 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2119) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.189661e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.096081e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.096081e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.029293e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.888276e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.888276e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.261039 sec - 9,715,236,596 cycles # 2.974 GHz - 18,955,064,271 instructions # 1.95 insn per cycle - 3.284788216 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) +TOTAL : 3.515786 sec + 9,723,051,646 cycles # 2.761 GHz + 18,859,468,530 instructions # 1.94 insn per cycle + 3.531121351 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1850) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.871346e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.512373e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.512373e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.839848e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.512898e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.512898e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.767041 sec - 8,387,450,309 cycles # 2.224 GHz - 15,057,184,265 instructions # 1.80 insn per cycle - 3.785820964 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) +TOTAL : 3.838759 sec + 8,110,381,366 cycles # 2.110 GHz + 14,814,382,883 instructions # 1.83 insn per cycle + 3.856049832 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1023) (512y: 64) (512z: 1327) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 486522e284..385e9ed225 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2023-11-24_14:30:31 +DATE: 2024-01-30_04:54:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.608833e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.666599e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.153828e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.428632e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.291557e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.197877e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 0.689454 sec - 2,790,185,844 cycles # 3.012 GHz - 4,322,858,919 instructions # 1.55 insn per cycle - 1.008572761 seconds time elapsed +TOTAL : 0.704173 sec + 2,700,513,236 cycles # 2.833 GHz + 4,160,757,344 instructions # 1.54 insn per cycle + 1.040080983 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.152239e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.354641e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.354641e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.048898e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.230601e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.230601e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 5.858273 sec - 18,113,057,840 cycles # 3.089 GHz - 42,472,824,521 instructions # 2.34 insn per cycle - 5.871069159 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.414952 sec + 18,538,807,361 cycles # 2.888 GHz + 44,591,647,960 instructions # 2.41 insn per cycle + 6.426389730 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 498) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.781270e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.374250e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.374250e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.655305e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204388e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204388e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.930434 sec - 12,144,033,984 cycles # 3.086 GHz - 30,226,379,703 instructions # 2.49 insn per cycle - 3.949543769 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.214890 sec + 12,207,966,974 cycles # 2.892 GHz + 30,217,340,923 instructions # 2.48 insn per cycle + 4.236133486 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1650) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.103753e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.937318e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.937318e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.899712e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.627205e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.627205e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.379502 sec - 10,060,863,868 cycles # 2.972 GHz - 19,256,812,816 instructions # 1.91 insn per cycle - 3.401404400 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) +TOTAL : 3.730288 sec + 10,158,219,608 cycles # 2.719 GHz + 19,037,132,874 instructions # 1.87 insn per cycle + 3.746558078 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2072) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.196753e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.119301e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.119301e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.048047e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.931283e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.931283e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.254342 sec - 9,672,428,613 cycles # 2.968 GHz - 18,746,035,376 instructions # 1.94 insn per cycle - 3.271198287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) +TOTAL : 3.492411 sec + 9,571,391,969 cycles # 2.738 GHz + 18,453,150,608 instructions # 1.93 insn per cycle + 3.509341045 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1775) (512y: 174) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.930945e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.613686e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.613686e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.170414e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.170487e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.170487e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 -TOTAL : 3.660732 sec - 8,282,994,053 cycles # 2.260 GHz - 14,980,698,691 instructions # 1.81 insn per cycle - 3.677125812 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) +TOTAL : 3.317793 sec + 7,240,072,684 cycles # 2.179 GHz + 13,244,781,040 instructions # 1.83 insn per cycle + 3.341198784 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 911) (512y: 56) (512z: 993) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 8af0b3625a..2453732bed 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:31:04 +DATE: 2024-01-30_04:55:10 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.273933e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156544e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270780e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.010275e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.133419e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272295e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.525661 sec - 2,285,041,869 cycles # 2.996 GHz - 3,259,226,119 instructions # 1.43 insn per cycle - 0.837415991 seconds time elapsed +TOTAL : 0.538830 sec + 2,187,358,219 cycles # 2.824 GHz + 3,139,905,445 instructions # 1.44 insn per cycle + 0.856073288 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.147376e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.208578e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.208578e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.073581e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.135755e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.135755e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.988043 sec - 15,215,107,476 cycles # 3.051 GHz - 38,378,066,668 instructions # 2.52 insn per cycle - 4.996465245 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.168677 sec + 14,980,961,047 cycles # 2.896 GHz + 38,724,485,120 instructions # 2.58 insn per cycle + 5.178651966 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.686108e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.886642e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.886642e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.523460e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.721558e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.721558e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.950305 sec - 9,139,887,426 cycles # 3.093 GHz - 24,578,745,754 instructions # 2.69 insn per cycle - 2.962852368 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.090012 sec + 8,952,192,290 cycles # 2.893 GHz + 24,430,503,496 instructions # 2.73 insn per cycle + 3.108451490 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.966833e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.492744e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.492744e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.390626e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.850527e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.850527e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.861721 sec - 5,467,401,485 cycles # 2.927 GHz - 11,252,306,729 instructions # 2.06 insn per cycle - 1.878450325 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +TOTAL : 2.056967 sec + 5,535,228,908 cycles # 2.683 GHz + 11,562,552,185 instructions # 2.09 insn per cycle + 2.068379535 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.607385e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.251361e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.251361e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.323214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.965355e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.965355e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.691034 sec - 4,950,324,717 cycles # 2.917 GHz - 10,558,681,133 instructions # 2.13 insn per cycle - 1.704227686 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) +TOTAL : 1.769440 sec + 4,825,692,035 cycles # 2.719 GHz + 10,341,008,591 instructions # 2.14 insn per cycle + 1.786949030 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.997999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.223536e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.223536e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.039053e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.289363e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.289363e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.729108 sec - 5,392,071,178 cycles # 1.972 GHz - 7,793,346,651 instructions # 1.45 insn per cycle - 2.742252965 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +TOTAL : 2.707049 sec + 4,944,236,176 cycles # 1.822 GHz + 7,554,838,116 instructions # 1.53 insn per cycle + 2.726854934 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index abc9cb4db6..adcfa48462 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:10:57 +DATE: 2024-01-30_05:46:39 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.408166e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.841952e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.841952e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.344134e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.848581e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.848581e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.827206 sec - 3,081,034,031 cycles # 2.876 GHz - 4,780,745,395 instructions # 1.55 insn per cycle - 1.128942393 seconds time elapsed +TOTAL : 0.830257 sec + 3,050,711,174 cycles # 2.837 GHz + 4,744,287,151 instructions # 1.56 insn per cycle + 1.134543078 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -81,20 +81,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.108776e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.169543e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.169543e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.051768e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112380e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.112380e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.156626 sec - 15,517,564,279 cycles # 3.006 GHz - 38,434,109,341 instructions # 2.48 insn per cycle - 5.163690036 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.299158 sec + 15,311,911,023 cycles # 2.886 GHz + 38,783,796,929 instructions # 2.53 insn per cycle + 5.307164517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,27 +102,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.545882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.738725e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.738725e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.466739e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.657869e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.657869e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.141184 sec - 9,467,320,162 cycles # 3.008 GHz - 24,761,393,280 instructions # 2.62 insn per cycle - 3.148330326 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.216841 sec + 9,297,524,138 cycles # 2.885 GHz + 24,613,723,387 instructions # 2.65 insn per cycle + 3.224967553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -137,20 +137,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.651058e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.129257e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.129257e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.363369e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.815752e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.815752e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.039696 sec - 5,802,587,355 cycles # 2.836 GHz - 11,538,014,597 instructions # 1.99 insn per cycle - 2.046642189 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +TOTAL : 2.149413 sec + 5,860,102,645 cycles # 2.720 GHz + 11,849,599,468 instructions # 2.02 insn per cycle + 2.157292568 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -165,20 +165,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.265451e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.861082e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.861082e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.162124e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.773170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.773170e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.859965 sec - 5,312,241,459 cycles # 2.848 GHz - 10,843,960,204 instructions # 2.04 insn per cycle - 1.867027334 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) +TOTAL : 1.893501 sec + 5,161,881,245 cycles # 2.717 GHz + 10,626,023,875 instructions # 2.06 insn per cycle + 1.901369932 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -193,20 +193,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.763993e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.974534e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.974534e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.945106e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.186618e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.186618e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.977042 sec - 5,732,583,120 cycles # 1.922 GHz - 8,038,189,975 instructions # 1.40 insn per cycle - 2.984163935 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +TOTAL : 2.853446 sec + 5,298,812,686 cycles # 1.853 GHz + 7,800,536,018 instructions # 1.47 insn per cycle + 2.861501356 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index 44f3cc8c52..b23b4b948e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:24:19 +DATE: 2024-01-30_06:00:31 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.501067e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156482e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.276049e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.565155e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155605e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269580e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 0.626850 sec - 2,536,021,563 cycles # 2.961 GHz - 3,670,883,227 instructions # 1.45 insn per cycle - 0.915863654 seconds time elapsed +TOTAL : 0.625620 sec + 2,433,553,514 cycles # 2.839 GHz + 3,531,317,325 instructions # 1.45 insn per cycle + 0.914840106 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.139594e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.203265e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.203265e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.073060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.134707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.134707e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 5.067424 sec - 15,399,016,558 cycles # 3.036 GHz - 38,390,017,774 instructions # 2.49 insn per cycle - 5.073191640 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.229961 sec + 15,157,435,498 cycles # 2.896 GHz + 38,739,723,091 instructions # 2.56 insn per cycle + 5.236486145 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.625228e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.825269e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.825269e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.526696e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.723798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.723798e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.057820 sec - 9,317,797,118 cycles # 3.043 GHz - 24,577,704,184 instructions # 2.64 insn per cycle - 3.063889144 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.143207 sec + 9,122,833,846 cycles # 2.898 GHz + 24,428,638,513 instructions # 2.68 insn per cycle + 3.149727451 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.671770e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.193648e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.193648e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.453278e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.923487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.923487e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.021111 sec - 5,663,540,101 cycles # 2.803 GHz - 11,235,902,290 instructions # 1.98 insn per cycle - 2.027218591 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +TOTAL : 2.094012 sec + 5,713,399,327 cycles # 2.721 GHz + 11,544,398,198 instructions # 2.02 insn per cycle + 2.100575275 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.440843e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.078509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.078509e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.340982e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.000324e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.000324e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.794383 sec - 5,172,944,461 cycles # 2.875 GHz - 10,505,659,530 instructions # 2.03 insn per cycle - 1.800280603 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) +TOTAL : 1.826424 sec + 5,007,819,577 cycles # 2.734 GHz + 10,288,512,439 instructions # 2.05 insn per cycle + 1.833139039 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.980860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.211163e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.211163e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.024689e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.274198e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.274198e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.800543 sec - 5,575,018,302 cycles # 1.987 GHz - 7,742,183,831 instructions # 1.39 insn per cycle - 2.806534120 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +TOTAL : 2.778758 sec + 5,115,298,192 cycles # 1.837 GHz + 7,503,411,062 instructions # 1.47 insn per cycle + 2.785395708 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index af2440539e..66a621d02a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:20:58 +DATE: 2024-01-30_05:57:01 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.564058e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155046e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.272805e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.578143e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.159887e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277521e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.562322 sec - 2,282,427,973 cycles # 2.881 GHz - 3,556,665,675 instructions # 1.56 insn per cycle - 0.849543273 seconds time elapsed +TOTAL : 0.567703 sec + 2,256,406,335 cycles # 2.832 GHz + 3,552,290,336 instructions # 1.57 insn per cycle + 0.856591173 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.099614e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.160783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.160783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.061569e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.123242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.123242e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.101479 sec - 15,194,748,744 cycles # 2.976 GHz - 38,374,370,448 instructions # 2.53 insn per cycle - 5.107016878 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.196625 sec + 14,980,592,489 cycles # 2.880 GHz + 38,723,298,937 instructions # 2.58 insn per cycle + 5.203366404 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.574526e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.771653e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.771653e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.518700e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.715553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.715553e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.040842 sec - 9,128,651,326 cycles # 2.997 GHz - 24,578,551,986 instructions # 2.69 insn per cycle - 3.046498874 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.090850 sec + 8,946,489,145 cycles # 2.890 GHz + 24,429,263,818 instructions # 2.73 insn per cycle + 3.097198356 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.714772e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.221583e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.221583e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.476437e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.948509e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.948509e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.940536 sec - 5,479,977,577 cycles # 2.817 GHz - 11,251,107,887 instructions # 2.05 insn per cycle - 1.946094154 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +TOTAL : 2.025257 sec + 5,523,468,825 cycles # 2.720 GHz + 11,561,737,650 instructions # 2.09 insn per cycle + 2.031752517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.393849e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.021326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.021326e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.358069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007551e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.745575 sec - 4,952,345,278 cycles # 2.829 GHz - 10,558,643,656 instructions # 2.13 insn per cycle - 1.751295687 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) +TOTAL : 1.759129 sec + 4,801,841,802 cycles # 2.722 GHz + 10,338,992,386 instructions # 2.15 insn per cycle + 1.765685267 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.913352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.137894e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.137894e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.036811e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.287808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.287808e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.786489 sec - 5,383,532,315 cycles # 1.930 GHz - 7,793,484,712 instructions # 1.45 insn per cycle - 2.792074852 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +TOTAL : 2.707972 sec + 4,942,835,417 cycles # 1.822 GHz + 7,554,452,946 instructions # 1.53 insn per cycle + 2.714536601 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index e2193a8ad7..defb46a739 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:17:39 +DATE: 2024-01-30_05:53:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.736010e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.153457e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268173e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.688012e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154108e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.269539e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.714625 sec - 2,774,816,362 cycles # 2.935 GHz - 4,339,252,047 instructions # 1.56 insn per cycle - 1.002590963 seconds time elapsed +TOTAL : 0.720498 sec + 2,707,766,521 cycles # 2.848 GHz + 4,278,662,893 instructions # 1.58 insn per cycle + 1.009865256 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -74,20 +74,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.123090e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.185367e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.185367e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.066193e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.127901e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.127901e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 5.045686 sec - 15,189,366,713 cycles # 3.007 GHz - 38,373,841,814 instructions # 2.53 insn per cycle - 5.051730938 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.184582 sec + 14,984,631,159 cycles # 2.888 GHz + 38,723,388,390 instructions # 2.58 insn per cycle + 5.191155299 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 719) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -95,26 +95,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388515645 -Relative difference = 3.258803994438787e-07 +Avg ME (F77/C++) = 2.0288063388515649 +Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.586999e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.782954e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.782954e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.511860e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.708377e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.708377e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.030197 sec - 9,119,166,751 cycles # 3.005 GHz - 24,577,924,210 instructions # 2.70 insn per cycle - 3.035979194 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.095761 sec + 8,950,231,816 cycles # 2.886 GHz + 24,430,052,071 instructions # 2.73 insn per cycle + 3.102564983 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -128,20 +128,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.732847e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.231342e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.231342e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.454029e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.925499e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.925499e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.935219 sec - 5,468,637,644 cycles # 2.819 GHz - 11,251,012,735 instructions # 2.06 insn per cycle - 1.941019031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) +TOTAL : 2.033615 sec + 5,531,582,240 cycles # 2.713 GHz + 11,562,288,179 instructions # 2.09 insn per cycle + 2.040383969 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -155,20 +155,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.292655e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.897577e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.897577e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.327959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.977069e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.977069e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.772120 sec - 4,969,624,681 cycles # 2.796 GHz - 10,556,732,876 instructions # 2.12 insn per cycle - 1.777837624 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) +TOTAL : 1.768254 sec + 4,816,907,251 cycles # 2.716 GHz + 10,339,308,595 instructions # 2.15 insn per cycle + 1.774968996 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1972) (512y: 131) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -182,20 +182,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.866239e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.092519e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.092519e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.992436e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.241387e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.241387e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.846913 sec - 5,391,039,174 cycles # 1.904 GHz - 7,794,494,031 instructions # 1.45 insn per cycle - 2.856378481 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) +TOTAL : 2.737995 sec + 4,943,973,305 cycles # 1.803 GHz + 7,555,690,658 instructions # 1.53 insn per cycle + 2.744582139 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1212) (512y: 65) (512z: 1543) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 977d853d64..fe6f195aa6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:31:31 +DATE: 2024-01-30_04:55:39 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.446963e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.163595e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.279925e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.125481e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158117e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273663e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.520650 sec - 2,264,469,596 cycles # 3.001 GHz - 3,268,925,935 instructions # 1.44 insn per cycle - 0.824307613 seconds time elapsed +TOTAL : 0.534485 sec + 2,191,778,134 cycles # 2.834 GHz + 3,140,951,827 instructions # 1.43 insn per cycle + 0.850685752 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.178244e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.243144e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.243144e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.109309e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.173415e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.173415e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.920762 sec - 15,120,754,616 cycles # 3.070 GHz - 40,101,358,095 instructions # 2.65 insn per cycle - 4.928885973 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.081724 sec + 14,685,294,357 cycles # 2.887 GHz + 39,544,026,748 instructions # 2.69 insn per cycle + 5.093038112 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 596) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.908591e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.133379e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.133379e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.661768e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.875473e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.875473e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.787534 sec - 8,670,203,341 cycles # 3.104 GHz - 23,670,756,212 instructions # 2.73 insn per cycle - 2.801209294 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.973090 sec + 8,600,238,365 cycles # 2.886 GHz + 23,576,508,735 instructions # 2.74 insn per cycle + 2.991032269 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1948) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.339479e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.760864e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.760864e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.966204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.352181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.352181e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.067737 sec - 6,095,582,988 cycles # 2.940 GHz - 13,061,573,034 instructions # 2.14 insn per cycle - 2.079447172 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) +TOTAL : 2.222703 sec + 5,964,350,122 cycles # 2.676 GHz + 13,193,903,385 instructions # 2.21 insn per cycle + 2.290428549 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2560) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.617733e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.075014e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.075014e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.425705e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.897406e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.897406e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.970361 sec - 5,800,772,939 cycles # 2.935 GHz - 12,319,856,699 instructions # 2.12 insn per cycle - 1.986108854 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) +TOTAL : 2.043603 sec + 5,539,021,528 cycles # 2.702 GHz + 12,103,311,893 instructions # 2.19 insn per cycle + 2.060365335 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2030) (512y: 278) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.793878e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.997441e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.997441e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.662802e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.870728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.870728e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.868253 sec - 5,826,829,144 cycles # 2.028 GHz - 9,602,061,917 instructions # 1.65 insn per cycle - 2.882159739 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) +TOTAL : 2.974749 sec + 5,366,303,915 cycles # 1.800 GHz + 9,381,926,109 instructions # 1.75 insn per cycle + 2.994553633 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1350) (512y: 88) (512z: 1989) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index eaf53289b1..8cd37966a9 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:00:34 +DATE: 2024-01-30_05:35:53 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.552166e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156300e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.271682e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.561376e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154966e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270589e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.524921 sec - 2,220,579,587 cycles # 2.921 GHz - 3,192,850,808 instructions # 1.44 insn per cycle - 0.818936986 seconds time elapsed +TOTAL : 0.529335 sec + 2,159,762,911 cycles # 2.829 GHz + 3,107,803,545 instructions # 1.44 insn per cycle + 0.822533200 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.433349e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.518013e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.518013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.227004e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.298943e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298943e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.419477 sec - 13,028,620,235 cycles # 2.950 GHz - 34,390,570,196 instructions # 2.64 insn per cycle - 4.425198715 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.818572 sec + 13,907,927,893 cycles # 2.883 GHz + 35,849,684,316 instructions # 2.58 insn per cycle + 4.825096940 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1078) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.068689e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.210680e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.210680e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.848483e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.087109e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.087109e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.525285 sec - 10,592,983,737 cycles # 3.000 GHz - 24,007,204,141 instructions # 2.27 insn per cycle - 3.531288478 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.835129 sec + 8,213,185,511 cycles # 2.892 GHz + 21,908,282,308 instructions # 2.67 insn per cycle + 2.841971377 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2334) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.591275e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.918287e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.918287e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.473983e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.948336e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.948336e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.392081 sec - 6,643,133,250 cycles # 2.771 GHz - 12,401,445,774 instructions # 1.87 insn per cycle - 2.398243451 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) +TOTAL : 2.025782 sec + 5,530,364,572 cycles # 2.723 GHz + 12,076,349,288 instructions # 2.18 insn per cycle + 2.032542267 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3062) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.018329e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.394522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.394522e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.936500e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.499652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.499652e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.195662 sec - 6,234,944,949 cycles # 2.833 GHz - 11,573,018,455 instructions # 1.86 insn per cycle - 2.201415670 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) +TOTAL : 1.876359 sec + 5,112,015,535 cycles # 2.716 GHz + 11,141,551,976 instructions # 2.18 insn per cycle + 1.883163972 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 224) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288063388516200 -Relative difference = 3.2588037208240405e-07 +Avg ME (F77/C++) = 2.0288063388516204 +Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.926324e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.149997e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.149997e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.149105e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.416003e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.416003e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.778213 sec - 5,362,353,364 cycles # 1.927 GHz - 9,296,610,904 instructions # 1.73 insn per cycle - 2.784120643 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) +TOTAL : 2.637284 sec + 4,829,728,502 cycles # 1.827 GHz + 8,842,382,666 instructions # 1.83 insn per cycle + 2.644418009 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 97) (512z: 2034) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 223a6bbd07..8eec31c0d3 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:01:02 +DATE: 2024-01-30_05:36:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.545579e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.156991e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274409e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.565410e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.157958e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274503e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.524046 sec - 2,220,143,747 cycles # 2.921 GHz - 3,161,876,071 instructions # 1.42 insn per cycle - 0.819384909 seconds time elapsed +TOTAL : 0.528794 sec + 2,178,979,969 cycles # 2.840 GHz + 3,111,172,536 instructions # 1.43 insn per cycle + 0.825662442 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.624216e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.719163e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.719163e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.483554e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.573797e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.573797e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.103081 sec - 12,357,400,963 cycles # 3.008 GHz - 35,038,305,221 instructions # 2.84 insn per cycle - 4.108989255 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.331634 sec + 12,513,147,299 cycles # 2.885 GHz + 35,729,824,625 instructions # 2.86 insn per cycle + 4.338115382 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.981089e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.114332e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.114332e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.944859e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.193242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.193242e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 3.624381 sec - 10,699,873,531 cycles # 2.953 GHz - 23,087,357,159 instructions # 2.16 insn per cycle - 3.630253133 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.766913 sec + 8,026,265,535 cycles # 2.895 GHz + 21,260,291,484 instructions # 2.65 insn per cycle + 2.773559046 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2088) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.093491e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.483832e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.483832e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.719292e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.240372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.240372e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.165104 sec - 6,157,585,167 cycles # 2.837 GHz - 11,956,600,181 instructions # 1.94 insn per cycle - 2.170909922 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) +TOTAL : 1.943097 sec + 5,300,809,350 cycles # 2.722 GHz + 11,405,959,044 instructions # 2.15 insn per cycle + 1.950186269 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2370) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -153,20 +153,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.214844e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.619393e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.619393e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.116224e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.720108e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.720108e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.116517 sec - 6,017,617,016 cycles # 2.837 GHz - 11,127,934,643 instructions # 1.85 insn per cycle - 2.122364748 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) +TOTAL : 1.828206 sec + 4,977,318,735 cycles # 2.718 GHz + 10,599,506,112 instructions # 2.13 insn per cycle + 1.834822870 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1970) (512y: 162) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -180,20 +180,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.073709e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.313809e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.313809e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.275159e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.556705e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.556705e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.680169 sec - 5,188,375,015 cycles # 1.932 GHz - 9,020,642,676 instructions # 1.74 insn per cycle - 2.686120218 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) +TOTAL : 2.563497 sec + 4,703,376,134 cycles # 1.831 GHz + 8,567,908,292 instructions # 1.82 insn per cycle + 2.570320519 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1392) (512y: 70) (512z: 1630) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 5b41a4d066..03334a40e8 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:31:58 +DATE: 2024-01-30_04:56:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.016432e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.646338e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.972972e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.266078e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.583524e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.962786e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480351 sec - 2,093,260,903 cycles # 2.971 GHz - 2,988,705,515 instructions # 1.43 insn per cycle - 0.774971081 seconds time elapsed +TOTAL : 0.486964 sec + 2,022,378,491 cycles # 2.826 GHz + 2,872,554,108 instructions # 1.42 insn per cycle + 0.794836465 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.337300e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.415066e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.415066e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.220233e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.293728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.293728e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.572712 sec - 14,005,022,680 cycles # 3.059 GHz - 38,340,620,868 instructions # 2.74 insn per cycle - 4.581007114 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.812282 sec + 13,901,639,181 cycles # 2.885 GHz + 37,078,732,469 instructions # 2.67 insn per cycle + 4.824222975 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.261983e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.699946e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.699946e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.150516e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.595808e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.595808e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.076354 sec - 6,472,095,043 cycles # 3.109 GHz - 15,815,873,405 instructions # 2.44 insn per cycle - 2.093110720 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.124737 sec + 6,168,101,005 cycles # 2.895 GHz + 15,212,489,109 instructions # 2.47 insn per cycle + 2.142108549 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.625847e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.106546e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.106546e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.173558 sec - 3,465,806,997 cycles # 2.939 GHz - 7,594,629,416 instructions # 2.19 insn per cycle - 1.191325878 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.954385e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.029179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.029179e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.259990 sec + 3,437,290,204 cycles # 2.715 GHz + 7,715,643,345 instructions # 2.24 insn per cycle + 1.287994689 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.036192e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.205283e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.205283e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.095666 sec - 3,249,594,094 cycles # 2.950 GHz - 7,203,151,855 instructions # 2.22 insn per cycle - 1.112919323 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.805420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.144112e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.144112e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.162105 sec + 3,179,163,625 cycles # 2.727 GHz + 7,109,925,739 instructions # 2.24 insn per cycle + 1.178171652 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.709924e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.585781e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.585781e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.071814e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.862424e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.862424e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.443704 sec - 3,056,175,378 cycles # 2.109 GHz - 5,834,850,495 instructions # 1.91 insn per cycle - 1.456542629 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.572617 sec + 2,980,157,633 cycles # 1.888 GHz + 5,763,820,562 instructions # 1.93 insn per cycle + 1.590552097 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index e2569bda32..3a80a864ae 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:11:25 +DATE: 2024-01-30_05:47:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.002389e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.469916e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.469916e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.753522e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.358863e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.358863e+07 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.673572 sec - 2,669,669,937 cycles # 2.942 GHz - 4,126,698,700 instructions # 1.55 insn per cycle - 0.966897204 seconds time elapsed +TOTAL : 0.684539 sec + 2,591,525,623 cycles # 2.839 GHz + 3,989,244,311 instructions # 1.54 insn per cycle + 0.972564077 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -81,20 +81,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.271859e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.346202e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.346202e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.212668e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.285744e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.285744e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.744076 sec - 14,203,194,064 cycles # 2.991 GHz - 38,386,589,589 instructions # 2.70 insn per cycle - 4.750636160 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.870798 sec + 14,070,285,227 cycles # 2.885 GHz + 37,122,197,019 instructions # 2.64 insn per cycle + 4.878379515 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -102,27 +102,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.023528e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.430819e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.430819e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.080420e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.515170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.515170e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.219353 sec - 6,668,132,532 cycles # 2.997 GHz - 16,095,906,246 instructions # 2.41 insn per cycle - 2.225914329 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.198766 sec + 6,358,773,769 cycles # 2.884 GHz + 15,492,113,204 instructions # 2.44 insn per cycle + 2.206392318 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -130,27 +130,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.036698e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.035040e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.035040e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.291419 sec - 3,685,819,556 cycles # 2.841 GHz - 7,831,220,798 instructions # 2.12 insn per cycle - 1.298093181 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.787706e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.007873e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.007873e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.328722 sec + 3,633,771,509 cycles # 2.722 GHz + 7,954,097,743 instructions # 2.19 insn per cycle + 1.336366634 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -158,27 +158,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.776675e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.132413e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.132413e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.202817 sec - 3,439,533,821 cycles # 2.846 GHz - 7,440,100,221 instructions # 2.16 insn per cycle - 1.209376262 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.612179e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.118037e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.118037e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.225517 sec + 3,366,927,421 cycles # 2.733 GHz + 7,347,508,752 instructions # 2.18 insn per cycle + 1.232992993 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -186,27 +186,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.099551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.865069e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.865069e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.960005e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.722467e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.722467e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.609144 sec - 3,270,738,224 cycles # 2.025 GHz - 6,088,987,367 instructions # 1.86 insn per cycle - 1.615635336 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.642570 sec + 3,181,631,608 cycles # 1.930 GHz + 6,021,725,956 instructions # 1.89 insn per cycle + 1.650041277 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -214,8 +214,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index 1279e79222..38a7216065 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:24:46 +DATE: 2024-01-30_06:00:59 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.310797e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.615494e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.945034e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.412461e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.631522e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.951868e+08 ) sec^-1 MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 -TOTAL : 0.567939 sec - 2,336,393,305 cycles # 2.964 GHz - 3,415,377,807 instructions # 1.46 insn per cycle - 0.847416380 seconds time elapsed +TOTAL : 0.573986 sec + 2,244,851,144 cycles # 2.822 GHz + 3,300,445,554 instructions # 1.47 insn per cycle + 0.853607464 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.285858e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.359466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.359466e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 4.728147 sec - 14,182,471,733 cycles # 2.997 GHz - 38,371,003,858 instructions # 2.71 insn per cycle - 4.733810751 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 2.218192e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.291861e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.291861e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 +TOTAL : 4.876695 sec + 14,064,697,494 cycles # 2.884 GHz + 37,110,369,611 instructions # 2.64 insn per cycle + 4.882981134 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.888028e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.282207e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.282207e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 2.288230 sec - 6,645,756,806 cycles # 2.898 GHz - 15,828,500,466 instructions # 2.38 insn per cycle - 2.293948357 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.131220e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.575839e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.575839e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.079572e+00 +- 3.404711e-03 ) GeV^0 +TOTAL : 2.187823 sec + 6,322,431,284 cycles # 2.883 GHz + 15,223,876,723 instructions # 2.41 insn per cycle + 2.194184928 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.344568e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.074101e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.074101e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.948892e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.027773e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.027773e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.263326 sec - 3,638,417,018 cycles # 2.869 GHz - 7,579,208,332 instructions # 2.08 insn per cycle - 1.269045825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +TOTAL : 1.320835 sec + 3,601,071,923 cycles # 2.719 GHz + 7,699,828,133 instructions # 2.14 insn per cycle + 1.327138068 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.006852e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.170884e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.170884e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.790537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142626e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.142626e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.182767 sec - 3,425,925,806 cycles # 2.885 GHz - 7,153,954,116 instructions # 2.09 insn per cycle - 1.188401535 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +TOTAL : 1.218690 sec + 3,342,798,362 cycles # 2.731 GHz + 7,059,572,278 instructions # 2.11 insn per cycle + 1.225217680 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.384551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.223950e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.223950e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.022088e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.806836e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.806836e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.561958 sec - 3,241,666,935 cycles # 2.069 GHz - 5,785,669,906 instructions # 1.78 insn per cycle - 1.567670740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.641075 sec + 3,147,503,652 cycles # 1.912 GHz + 5,713,849,148 instructions # 1.82 insn per cycle + 1.647331874 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index c75f243f6b..cb54d3236b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:21:25 +DATE: 2024-01-30_05:57:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.425944e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.638582e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.958027e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.414196e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.655173e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.981062e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.512465 sec - 2,128,740,635 cycles # 2.913 GHz - 3,349,486,673 instructions # 1.57 insn per cycle - 0.789824764 seconds time elapsed +TOTAL : 0.514856 sec + 2,089,539,478 cycles # 2.840 GHz + 3,296,506,746 instructions # 1.58 insn per cycle + 0.794530995 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.258814e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.334265e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.334265e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.227183e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.300870e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.300870e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.729033 sec - 14,008,183,941 cycles # 2.959 GHz - 38,341,321,376 instructions # 2.74 insn per cycle - 4.734430894 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.796202 sec + 13,896,514,461 cycles # 2.894 GHz + 37,078,595,071 instructions # 2.67 insn per cycle + 4.803618427 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.915242e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.303544e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.303544e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.077190e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.527451e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.527451e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.218605 sec - 6,466,211,293 cycles # 2.909 GHz - 15,815,641,765 instructions # 2.45 insn per cycle - 2.223961450 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.154367 sec + 6,177,704,022 cycles # 2.870 GHz + 15,215,532,210 instructions # 2.46 insn per cycle + 2.160620609 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.288130e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.067419e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.067419e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.215145 sec - 3,455,872,602 cycles # 2.833 GHz - 7,593,852,686 instructions # 2.20 insn per cycle - 1.220604147 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.911398e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.023141e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.023141e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.265022 sec + 3,447,761,650 cycles # 2.714 GHz + 7,715,058,636 instructions # 2.24 insn per cycle + 1.271511064 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.949365e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155182e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.155182e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.138566 sec - 3,249,226,385 cycles # 2.842 GHz - 7,201,814,152 instructions # 2.22 insn per cycle - 1.144215776 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.829060e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.147412e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.147412e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.155347 sec + 3,170,001,813 cycles # 2.731 GHz + 7,109,524,161 instructions # 2.24 insn per cycle + 1.161808340 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.332937e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.137553e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.137553e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.999480e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.774350e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.774350e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.516868 sec - 3,052,773,549 cycles # 2.006 GHz - 5,834,286,886 instructions # 1.91 insn per cycle - 1.522319115 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.586909 sec + 2,978,718,352 cycles # 1.871 GHz + 5,762,941,941 instructions # 1.93 insn per cycle + 1.593095591 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index bfc5cc0709..5939268227 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:18:07 +DATE: 2024-01-30_05:54:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.776083e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.627754e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.948386e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.468280e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.632924e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.955888e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 -TOTAL : 0.618119 sec - 2,443,289,573 cycles # 2.920 GHz - 3,806,362,954 instructions # 1.56 insn per cycle - 0.895384570 seconds time elapsed +TOTAL : 0.625640 sec + 2,402,533,839 cycles # 2.841 GHz + 3,758,306,095 instructions # 1.56 insn per cycle + 0.905223049 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -74,20 +74,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.285771e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.360708e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.360708e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.221197e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.294941e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.294941e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.672900 sec - 14,007,447,781 cycles # 2.995 GHz - 38,341,188,487 instructions # 2.74 insn per cycle - 4.678325774 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.809027 sec + 13,889,421,482 cycles # 2.885 GHz + 37,078,742,557 instructions # 2.67 insn per cycle + 4.815296717 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 578) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -95,26 +95,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199022179469 -Relative difference = 4.819651478256564e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.077746e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.495127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.495127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.146065e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.592205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.592205e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.152723 sec - 6,468,101,195 cycles # 2.999 GHz - 15,815,386,942 instructions # 2.45 insn per cycle - 2.158120643 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.125330 sec + 6,161,438,553 cycles # 2.892 GHz + 15,211,397,983 instructions # 2.47 insn per cycle + 2.131726868 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2459) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -122,26 +122,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.236653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.062344e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.062344e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.220471 sec - 3,470,157,914 cycles # 2.832 GHz - 7,593,755,062 instructions # 2.19 insn per cycle - 1.225848326 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.991330e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.034099e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.034099e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.255088 sec + 3,440,029,043 cycles # 2.730 GHz + 7,714,775,848 instructions # 2.24 insn per cycle + 1.261283713 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3071) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -149,26 +149,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.915047e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.151420e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.151420e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.142912 sec - 3,256,437,189 cycles # 2.838 GHz - 7,202,634,682 instructions # 2.21 insn per cycle - 1.148660140 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 9.843583e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.149362e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.149362e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.153935 sec + 3,172,826,861 cycles # 2.738 GHz + 7,109,210,779 instructions # 2.24 insn per cycle + 1.160268530 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2733) (512y: 13) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,26 +176,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181684445590 -Relative difference = 8.302595855806234e-08 +Avg ME (F77/C++) = 2.0288179996423423 +Relative difference = 1.7628858734720142e-10 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.280239e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.088898e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.088898e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.077925e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.872855e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.872855e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.526066 sec - 3,058,452,115 cycles # 1.998 GHz - 5,834,337,483 instructions # 1.91 insn per cycle - 1.531519039 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) +TOTAL : 1.570762 sec + 2,979,903,068 cycles # 1.891 GHz + 5,762,829,882 instructions # 1.93 insn per cycle + 1.577195857 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2088) (512y: 20) (512z: 1914) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -203,8 +203,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183349184692 -Relative difference = 1.6508058850146622e-07 +Avg ME (F77/C++) = 2.0288183195516467 +Relative difference = 1.5750631496822894e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 5c8f13a099..c96a0bb3db 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:32:21 +DATE: 2024-01-30_04:56:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.367052e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.685993e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.017698e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.421312e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.704045e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.041754e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.471621 sec - 2,109,591,981 cycles # 3.033 GHz - 3,025,510,144 instructions # 1.43 insn per cycle - 0.774479825 seconds time elapsed +TOTAL : 0.486618 sec + 2,018,521,842 cycles # 2.827 GHz + 2,837,894,141 instructions # 1.41 insn per cycle + 0.795623791 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.266296e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.337160e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.337160e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.245629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.320181e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.320181e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.710609 sec - 14,404,518,200 cycles # 3.055 GHz - 39,833,821,591 instructions # 2.77 insn per cycle - 4.718963135 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.758439 sec + 13,805,800,630 cycles # 2.898 GHz + 37,480,161,839 instructions # 2.71 insn per cycle + 4.770650257 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 503) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199028000236 -Relative difference = 4.790961076489297e-08 +Avg ME (F77/C++) = 2.0288197983754799 +Relative difference = 9.938019153537065e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.070167e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.657007e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.657007e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.821274e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.398672e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.398672e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.811511 sec - 5,577,453,918 cycles # 3.070 GHz - 15,285,624,341 instructions # 2.74 insn per cycle - 1.824268128 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.889404 sec + 5,475,292,589 cycles # 2.889 GHz + 15,244,893,114 instructions # 2.78 insn per cycle + 1.908184587 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2330) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193548331037 -Relative difference = 1.748963824709674e-07 +Avg ME (F77/C++) = 2.0288191968575120 +Relative difference = 9.703059369476286e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.861603e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.557183e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.557183e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.611040 sec - 4,742,441,153 cycles # 2.934 GHz - 9,735,233,054 instructions # 2.05 insn per cycle - 1.623131424 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.385813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.037637e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.037637e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.731302 sec + 4,719,001,422 cycles # 2.717 GHz + 9,850,811,081 instructions # 2.09 insn per cycle + 1.750777348 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3750) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 +Avg ME (F77/C++) = 2.0288180243223906 +Relative difference = 1.1988453753912676e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.988259e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.740532e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.740532e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.584956 sec - 4,623,129,145 cycles # 2.906 GHz - 9,326,921,854 instructions # 2.02 insn per cycle - 1.599836475 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.683577e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.409489e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.409489e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.659075 sec + 4,492,699,411 cycles # 2.699 GHz + 9,202,452,349 instructions # 2.05 insn per cycle + 1.671352513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3384) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182108197361 -Relative difference = 1.0391259163456515e-07 +Avg ME (F77/C++) = 2.0288180243223906 +Relative difference = 1.1988453753912676e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.277364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.853368e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.853368e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.938211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.486110e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.486110e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.753984 sec - 3,644,734,123 cycles # 2.072 GHz - 7,034,553,715 instructions # 1.93 insn per cycle - 1.766522849 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2605) (512y: 12) (512z: 2221) +TOTAL : 1.854854 sec + 3,463,720,216 cycles # 1.861 GHz + 6,875,040,962 instructions # 1.98 insn per cycle + 1.876340349 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2257) (512y: 8) (512z: 2261) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183459779248 -Relative difference = 1.7053177021099307e-07 +Avg ME (F77/C++) = 2.0288183217635378 +Relative difference = 1.5859655131013432e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 665123002a..993f4107d6 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:01:29 +DATE: 2024-01-30_05:36:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.406259e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.654901e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.974381e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.377362e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.649325e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.974675e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.478015 sec - 2,052,116,202 cycles # 2.923 GHz - 2,932,366,143 instructions # 1.43 insn per cycle - 0.759588643 seconds time elapsed +TOTAL : 0.484150 sec + 2,005,186,574 cycles # 2.831 GHz + 2,872,226,914 instructions # 1.43 insn per cycle + 0.768013554 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.529261e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.621856e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.621856e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.479081e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.570421e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.570421e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 4.232103 sec - 12,595,483,741 cycles # 2.973 GHz - 34,372,550,033 instructions # 2.73 insn per cycle - 4.237616805 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.318821 sec + 12,411,469,267 cycles # 2.871 GHz + 34,216,954,204 instructions # 2.76 insn per cycle + 4.325006925 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 768) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288199088536203 +Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.409282e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.897832e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.897832e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.935196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.540988e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.540988e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 2.025230 sec - 6,107,661,405 cycles # 3.009 GHz - 14,860,133,021 instructions # 2.43 insn per cycle - 2.031040801 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.856505 sec + 5,363,525,325 cycles # 2.881 GHz + 14,587,825,944 instructions # 2.72 insn per cycle + 1.863141926 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2947) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193803280592 -Relative difference = 1.8746278463897685e-07 +Avg ME (F77/C++) = 2.0288192580919713 +Relative difference = 1.2721291123071246e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.413017e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.265518e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.265518e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.499210 sec - 4,274,397,689 cycles # 2.842 GHz - 9,028,681,209 instructions # 2.11 insn per cycle - 1.505009031 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.475828e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.385170e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.385170e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.492139 sec + 4,058,079,431 cycles # 2.710 GHz + 9,088,895,483 instructions # 2.24 insn per cycle + 1.498802038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4501) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.545085e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.419779e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.419779e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.475217 sec - 4,206,901,352 cycles # 2.842 GHz - 8,664,206,183 instructions # 2.06 insn per cycle - 1.480947152 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.052179e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.125609e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.125609e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.390912 sec + 3,795,132,868 cycles # 2.718 GHz + 8,440,638,214 instructions # 2.22 insn per cycle + 1.397579629 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4043) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288181999931112 -Relative difference = 9.857617164523888e-08 +Avg ME (F77/C++) = 2.0288180499337614 +Relative difference = 2.4612242975974814e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.444457e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.903029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.903029e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.426211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.889827e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.889827e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 2.015920 sec - 3,844,199,069 cycles # 1.909 GHz - 7,810,135,811 instructions # 2.03 insn per cycle - 2.021868576 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4419) (512y: 0) (512z: 2556) +TOTAL : 2.021883 sec + 3,727,709,927 cycles # 1.839 GHz + 7,572,021,248 instructions # 2.03 insn per cycle + 2.028341317 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3646) (512y: 1) (512z: 2853) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183246739209 -Relative difference = 1.6003107281264138e-07 +Avg ME (F77/C++) = 2.0288183350348845 +Relative difference = 1.6513796936156652e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index fa97bf17a1..2891f046ff 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_15:01:53 +DATE: 2024-01-30_05:37:12 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.425807e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.669791e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.006545e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.485748e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.689974e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.027356e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 -TOTAL : 0.480700 sec - 2,057,638,891 cycles # 2.914 GHz - 2,939,721,478 instructions # 1.43 insn per cycle - 0.763589412 seconds time elapsed +TOTAL : 0.482102 sec + 1,996,662,355 cycles # 2.812 GHz + 2,850,200,230 instructions # 1.43 insn per cycle + 0.768087139 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.734181e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.846132e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.846132e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.596095e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.696763e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.696763e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 -TOTAL : 3.923431 sec - 11,754,442,627 cycles # 2.992 GHz - 35,109,278,353 instructions # 2.99 insn per cycle - 3.929236394 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.127183 sec + 11,946,394,247 cycles # 2.891 GHz + 35,407,075,530 instructions # 2.96 insn per cycle + 4.133301161 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 469) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -93,26 +93,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 -Avg ME (F77/C++) = 2.0288199094356969 -Relative difference = 4.463890496342449e-08 +Avg ME (F77/C++) = 2.0288199088536203 +Relative difference = 4.4925808981097166e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.529892e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.033657e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.033657e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.250434e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.927787e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.927787e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 -TOTAL : 1.982127 sec - 5,950,643,888 cycles # 2.995 GHz - 14,469,850,336 instructions # 2.43 insn per cycle - 1.987737448 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.766919 sec + 5,069,845,731 cycles # 2.861 GHz + 14,044,971,447 instructions # 2.77 insn per cycle + 1.773365949 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2487) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -120,26 +120,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 -Avg ME (F77/C++) = 2.0288193583255634 -Relative difference = 1.7661780742548925e-07 +Avg ME (F77/C++) = 2.0288192554144189 +Relative difference = 1.2589315209891237e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.598308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.516031e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.516031e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.464407 sec - 4,161,674,169 cycles # 2.832 GHz - 8,874,846,514 instructions # 2.13 insn per cycle - 1.469970354 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.559784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.492213e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.492213e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.476405 sec + 3,988,953,115 cycles # 2.692 GHz + 8,629,569,798 instructions # 2.16 insn per cycle + 1.482936821 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.657230e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.559591e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.559591e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.454835 sec - 4,142,234,613 cycles # 2.837 GHz - 8,411,473,385 instructions # 2.03 insn per cycle - 1.460527785 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 8.210818e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.331985e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.331985e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414230e-03 ) GeV^0 +TOTAL : 1.366166 sec + 3,694,176,022 cycles # 2.694 GHz + 8,100,845,822 instructions # 2.19 insn per cycle + 1.372646371 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3105) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288182107033208 -Relative difference = 1.0385521077446488e-07 +Avg ME (F77/C++) = 2.0288180815987289 +Relative difference = 4.021983692325164e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.747619e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.236667e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.236667e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.670710e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.170464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.170464e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 -TOTAL : 1.909420 sec - 3,779,842,439 cycles # 1.975 GHz - 7,700,519,284 instructions # 2.04 insn per cycle - 1.914942288 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3435) (512y: 0) (512z: 2108) +TOTAL : 1.938240 sec + 3,580,879,514 cycles # 1.843 GHz + 7,373,942,234 instructions # 2.06 insn per cycle + 1.944698982 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2803) (512y: 1) (512z: 2230) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 -Avg ME (F77/C++) = 2.0288183204829693 -Relative difference = 1.5796536184903122e-07 +Avg ME (F77/C++) = 2.0288183569209650 +Relative difference = 1.7592557106041962e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index f4ab44e796..26cb412a69 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:32:45 +DATE: 2024-01-30_04:56:57 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.583584e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.155538e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269480e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.567190e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.153367e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271156e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.521409 sec - 2,266,982,107 cycles # 3.000 GHz - 3,245,259,859 instructions # 1.43 insn per cycle - 0.825828714 seconds time elapsed +TOTAL : 0.531785 sec + 2,166,596,506 cycles # 2.818 GHz + 3,096,992,570 instructions # 1.43 insn per cycle + 0.839064322 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.183966e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.247909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.247909e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.035137e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.096372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.096372e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.906531 sec - 15,256,207,680 cycles # 3.106 GHz - 38,576,110,907 instructions # 2.53 insn per cycle - 4.915147304 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.263186 sec + 15,248,441,904 cycles # 2.894 GHz + 39,293,765,746 instructions # 2.58 insn per cycle + 5.273287972 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 740) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.714814e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.918909e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.918909e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.565129e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766484e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766484e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.927400 sec - 8,963,531,758 cycles # 3.056 GHz - 24,224,066,775 instructions # 2.70 insn per cycle - 2.945563789 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.050997 sec + 8,847,131,595 cycles # 2.894 GHz + 24,093,216,326 instructions # 2.72 insn per cycle + 3.069927720 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2102) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.052463e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.591282e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.591282e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.446912e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.914435e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.914435e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.836853 sec - 5,386,059,012 cycles # 2.923 GHz - 11,276,681,340 instructions # 2.09 insn per cycle - 1.853860147 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) +TOTAL : 2.035886 sec + 5,501,574,982 cycles # 2.694 GHz + 11,449,152,902 instructions # 2.08 insn per cycle + 2.052044507 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.743199e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.418159e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.418159e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.398707e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.055840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.055840e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 1.658034 sec - 4,871,841,569 cycles # 2.928 GHz - 10,526,597,351 instructions # 2.16 insn per cycle - 1.674830124 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) +TOTAL : 1.750074 sec + 4,773,598,492 cycles # 2.718 GHz + 10,317,257,525 instructions # 2.16 insn per cycle + 1.763056572 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2076) (512y: 133) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.142731e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.384818e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.384818e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.115786e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.377584e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.377584e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.635826 sec - 5,341,588,845 cycles # 2.022 GHz - 7,603,619,006 instructions # 1.42 insn per cycle - 2.650860981 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) +TOTAL : 2.659793 sec + 4,851,599,101 cycles # 1.820 GHz + 7,367,812,046 instructions # 1.52 insn per cycle + 2.678537528 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1366) (512y: 69) (512z: 1611) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index 261980075b..3aadf8f9be 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2023-11-24_14:33:12 +DATE: 2024-01-30_04:57:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.569187e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154655e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.270741e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.571537e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158030e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273800e+08 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 0.519942 sec - 2,236,595,831 cycles # 2.972 GHz - 3,152,119,637 instructions # 1.41 insn per cycle - 0.821038019 seconds time elapsed +TOTAL : 0.527677 sec + 2,187,527,722 cycles # 2.838 GHz + 3,113,906,107 instructions # 1.42 insn per cycle + 0.843196902 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 @@ -72,20 +72,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.146577e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.209722e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.209722e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.053597e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.114429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.114429e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 4.990433 sec - 15,248,789,401 cycles # 3.053 GHz - 40,369,458,696 instructions # 2.65 insn per cycle - 4.998323590 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.216554 sec + 15,076,935,035 cycles # 2.887 GHz + 40,115,062,840 instructions # 2.66 insn per cycle + 5.225437216 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 630) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.998701e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.233932e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.233932e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.498695e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.695294e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.695294e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.726132 sec - 8,491,093,801 cycles # 3.108 GHz - 23,253,788,222 instructions # 2.74 insn per cycle - 2.739959213 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.107625 sec + 8,698,982,275 cycles # 2.794 GHz + 23,534,504,437 instructions # 2.71 insn per cycle + 3.124975720 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1993) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -126,20 +126,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.127833e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.510579e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.510579e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.826638e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.191418e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.191418e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.148617 sec - 6,247,078,338 cycles # 2.900 GHz - 12,963,549,380 instructions # 2.08 insn per cycle - 2.164643261 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) +TOTAL : 2.282934 sec + 6,198,059,216 cycles # 2.708 GHz + 13,103,377,766 instructions # 2.11 insn per cycle + 2.300648997 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2711) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
@@ -147,26 +147,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.424850e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.870009e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.870009e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.224417e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.653642e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.653642e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.038674 sec - 5,929,446,799 cycles # 2.902 GHz - 12,241,198,193 instructions # 2.06 insn per cycle - 2.052944380 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) +TOTAL : 2.117622 sec + 5,754,647,700 cycles # 2.709 GHz + 12,210,180,073 instructions # 2.12 insn per cycle + 2.133681313 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2201) (512y: 282) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -174,26 +174,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.928483e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.143845e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.143845e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.752218e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971190e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971190e+05 ) sec^-1 MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 -TOTAL : 2.772333 sec - 5,602,682,561 cycles # 2.017 GHz - 8,743,458,912 instructions # 1.56 insn per cycle - 2.786330437 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) +TOTAL : 2.905718 sec + 5,261,261,771 cycles # 1.807 GHz + 8,449,535,603 instructions # 1.61 insn per cycle + 2.918034623 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1324) (512y: 84) (512z: 1919) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -201,8 +201,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesse cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 -Avg ME (F77/C++) = 2.0288064057068964 -Relative difference = 2.9292737240031234e-07 +Avg ME (F77/C++) = 2.0288063930599014 +Relative difference = 2.9916108265801754e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 157dda07e9..93e04f110e 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:33:40 +DATE: 2024-01-30_04:57:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.750385e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.049934e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.064279e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.751466e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.044991e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.059567e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.462904 sec - 2,042,714,082 cycles # 3.011 GHz - 2,902,296,650 instructions # 1.42 insn per cycle - 0.758295254 seconds time elapsed +TOTAL : 0.471853 sec + 1,938,631,197 cycles # 2.818 GHz + 2,775,429,754 instructions # 1.43 insn per cycle + 0.768838740 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.085304e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.323684e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.337692e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.083310e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.323559e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.337755e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.603567 sec - 2,537,612,301 cycles # 3.015 GHz - 3,865,051,021 instructions # 1.52 insn per cycle - 0.900037358 seconds time elapsed +TOTAL : 0.612248 sec + 2,402,912,694 cycles # 2.815 GHz + 3,669,599,520 instructions # 1.53 insn per cycle + 0.914185147 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.562300e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.574662e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.574662e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.436781e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.449292e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.449292e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.416503 sec - 19,738,905,705 cycles # 3.075 GHz - 59,604,379,345 instructions # 3.02 insn per cycle - 6.423169029 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.748293 sec + 19,527,368,133 cycles # 2.892 GHz + 57,921,410,950 instructions # 2.97 insn per cycle + 6.756473501 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.963631e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.008800e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.008800e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.689715e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.736371e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.736371e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.322549 sec - 10,350,945,705 cycles # 3.112 GHz - 30,674,390,605 instructions # 2.96 insn per cycle - 3.334406966 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.517761 sec + 10,204,769,485 cycles # 2.897 GHz + 29,944,325,485 instructions # 2.93 insn per cycle + 3.533017528 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.645905e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.824459e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.824459e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.110539e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.290286e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.290286e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.720961 sec - 4,880,621,681 cycles # 2.829 GHz - 11,019,918,413 instructions # 2.26 insn per cycle - 1.732752875 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) +TOTAL : 1.822880 sec + 4,929,256,319 cycles # 2.697 GHz + 11,212,094,634 instructions # 2.27 insn per cycle + 1.842452367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.100615e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.123065e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.123065e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.045459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.068242e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.068242e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.510594 sec - 4,368,423,747 cycles # 2.884 GHz - 10,296,629,918 instructions # 2.36 insn per cycle - 1.523579096 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) +TOTAL : 1.591153 sec + 4,310,771,194 cycles # 2.701 GHz + 10,188,135,001 instructions # 2.36 insn per cycle + 1.604477930 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.837255e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.949435e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.949435e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.350984e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.465337e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.465337e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.114472 sec - 4,096,385,157 cycles # 1.934 GHz - 5,842,611,877 instructions # 1.43 insn per cycle - 2.127694185 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) +TOTAL : 2.255127 sec + 3,913,955,092 cycles # 1.732 GHz + 5,709,470,043 instructions # 1.46 insn per cycle + 2.269083887 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 62c84d1195..ec4707eb36 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_15:11:49 +DATE: 2024-01-30_05:47:33 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.639935e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.816863e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.816863e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.528893e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.736864e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.736864e+06 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.492213 sec - 2,054,207,791 cycles # 2.926 GHz - 3,126,128,113 instructions # 1.52 insn per cycle - 0.760942013 seconds time elapsed +TOTAL : 0.499721 sec + 2,019,319,467 cycles # 2.834 GHz + 3,049,308,251 instructions # 1.51 insn per cycle + 0.770515897 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! 
Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.653176e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.472944e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.472944e+06 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.631733e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.469522e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.469522e+06 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.830881 sec - 3,123,864,419 cycles # 2.883 GHz - 4,825,503,220 instructions # 1.54 insn per cycle - 1.142577785 seconds time elapsed +TOTAL : 0.838079 sec + 3,105,645,423 cycles # 2.841 GHz + 4,885,001,867 instructions # 1.57 insn per cycle + 1.151319170 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.499287e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.511764e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.511764e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.430928e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.443345e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.443345e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.585609 sec - 19,745,466,264 cycles # 2.998 GHz - 59,612,197,800 instructions # 3.02 insn per cycle - 6.590135647 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.772792 sec + 19,550,332,735 cycles # 2.885 GHz + 57,928,238,854 instructions # 2.96 insn per cycle + 6.778111068 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1134) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
@@ -120,27 +120,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.693234e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.737492e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.737492e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.642090e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.688492e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.688492e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.521832 sec - 10,407,813,275 cycles # 2.952 GHz - 30,725,415,932 instructions # 2.95 insn per cycle - 3.526742122 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.562259 sec + 10,259,962,003 cycles # 2.883 GHz + 29,997,071,393 instructions # 2.92 insn per cycle + 3.567805037 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4742) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -155,20 +155,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.553194e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.735067e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.735067e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.060333e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.240360e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.240360e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.745455 sec - 4,924,157,638 cycles # 2.815 GHz - 11,068,084,114 instructions # 2.25 insn per cycle - 1.749966089 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) +TOTAL : 1.842606 sec + 4,975,429,359 cycles # 2.695 GHz + 11,262,132,806 instructions # 2.26 insn per cycle + 1.848498494 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4396) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -183,20 +183,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.070624e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.092463e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.092463e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.041344e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.064837e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.064837e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.559525 sec - 4,401,889,175 cycles # 2.815 GHz - 10,345,025,900 instructions # 2.35 insn per cycle - 1.564094276 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) +TOTAL : 1.605579 sec + 4,356,497,896 cycles # 2.706 GHz + 10,236,092,665 instructions # 2.35 insn per cycle + 1.611218031 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3895) (512y: 81) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -211,20 +211,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.394172e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.507838e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.507838e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.341333e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.457820e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.457820e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.248960 sec - 4,150,304,517 cycles # 1.842 GHz - 5,879,795,808 instructions # 1.42 insn per cycle - 2.253443263 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) +TOTAL : 2.270029 sec + 3,960,771,261 cycles # 1.743 GHz + 5,748,864,563 instructions # 1.45 insn per cycle + 2.275659808 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1258) (512y: 74) (512z: 3396) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 739a090c03..e0fcb209a0 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:34:09 +DATE: 2024-01-30_04:58:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.713263e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.038994e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.053292e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.715814e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.042075e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056833e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.459803 sec - 2,029,299,480 cycles # 3.004 GHz - 2,918,522,159 instructions # 1.44 insn per cycle - 0.740886124 seconds time elapsed +TOTAL : 0.470883 sec + 1,939,912,503 cycles # 2.822 GHz + 2,790,884,564 instructions # 1.44 insn per cycle + 0.765236939 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.076194e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.311479e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.325166e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.074401e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309128e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.323134e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.596867 sec - 2,506,078,107 cycles # 3.006 GHz - 3,672,565,526 instructions # 1.47 insn per cycle - 0.892288871 seconds time elapsed +TOTAL : 0.606508 sec + 2,399,848,951 cycles # 2.837 GHz + 3,558,977,452 instructions # 1.48 insn per cycle + 0.907497861 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.614658e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.627547e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.627547e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.442527e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.455052e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.455052e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.288241 sec - 19,506,895,795 cycles # 3.101 GHz - 58,795,912,300 instructions # 3.01 insn per cycle - 6.294901652 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.732470 sec + 19,518,863,765 cycles # 2.898 GHz + 57,747,544,085 instructions # 2.96 insn per cycle + 6.739693684 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1087) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213684432433 -Relative difference = 4.46923023397472e-07 +Avg ME (F77/C++) = 1.4131213684432431 +Relative difference = 4.4692302355460254e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.998673e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.044453e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.044453e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.661123e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.707073e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.707073e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.299411 sec - 10,248,689,037 cycles # 3.103 GHz - 30,346,682,819 instructions # 2.96 insn per cycle - 3.310933910 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.538386 sec + 10,268,038,737 cycles # 2.898 GHz + 30,334,584,369 instructions # 2.95 insn per cycle + 3.554140482 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.591359e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.761947e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.761947e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.842618e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.012045e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.012045e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.730287 sec - 5,042,983,525 cycles # 2.908 GHz - 11,483,783,187 instructions # 2.28 insn per cycle - 1.743862515 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) +TOTAL : 1.876874 sec + 5,068,616,518 cycles # 2.693 GHz + 11,664,707,542 instructions # 2.30 insn per cycle + 1.896780245 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4489) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.041838e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.061977e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.061977e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.766097e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.969139e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.969139e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.594766 sec - 4,637,285,824 cycles # 2.900 GHz - 10,841,968,745 instructions # 2.34 insn per cycle - 1.604083789 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) +TOTAL : 1.701579 sec + 4,623,474,911 cycles # 2.710 GHz + 10,806,178,257 instructions # 2.34 insn per cycle + 1.712732749 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3988) (512y: 237) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.531164e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.641315e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.641315e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.261988e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.377447e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.377447e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.200047 sec - 4,116,597,786 cycles # 1.868 GHz - 6,106,799,640 instructions # 1.48 insn per cycle - 2.213612018 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) +TOTAL : 2.282726 sec + 3,962,643,032 cycles # 1.733 GHz + 5,999,265,657 instructions # 1.51 insn per cycle + 2.297742409 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1241) (512y: 81) (512z: 3500) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. 
diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index ee258b9eb8..809c0d4a45 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:34:38 +DATE: 2024-01-30_04:58:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.486813e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.354265e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.453893e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.450759e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.307242e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.403943e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.444260 sec - 1,976,391,201 cycles # 2.992 GHz - 2,802,325,698 instructions # 1.42 insn per cycle - 0.735914466 seconds time elapsed +TOTAL : 0.453655 sec + 1,885,130,441 cycles # 2.809 GHz + 2,653,723,410 instructions # 1.41 insn per cycle + 0.747134110 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.217656e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.394394e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.476515e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.211065e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.390139e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.474767e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.493371 sec - 2,193,458,897 cycles # 3.016 GHz - 3,142,646,768 instructions # 1.43 insn per cycle - 0.786643658 seconds time elapsed +TOTAL : 0.497953 sec + 2,053,184,300 cycles # 2.823 GHz + 2,862,941,904 instructions # 1.39 insn per cycle + 0.785494017 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,74 +86,74 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.645635e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.659236e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.659236e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.619709e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.634289e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.634289e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.214195 sec - 19,058,711,796 cycles # 3.065 GHz - 58,958,354,991 instructions # 3.09 insn per cycle - 6.220591504 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.276631 sec + 18,176,411,104 cycles # 2.894 GHz + 55,238,700,170 instructions # 3.04 insn per cycle + 6.284146623 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. 
------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412998e+00 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.795314e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.946425e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.946425e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.447433e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.602543e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.602543e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.882613 sec - 5,847,309,673 cycles # 3.099 GHz - 16,694,109,682 instructions # 2.86 insn per cycle - 1.898696341 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.961648 sec + 5,691,843,956 cycles # 2.895 GHz + 16,128,541,176 instructions # 2.83 insn per cycle + 1.980848485 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.901790e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.970007e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.970007e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.757867e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.823085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823085e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.881272 sec - 2,581,984,725 cycles # 2.916 GHz - 5,980,308,545 instructions # 2.32 insn per cycle - 0.895784511 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) +TOTAL : 0.954501 sec + 2,591,810,421 cycles # 2.702 GHz + 6,085,915,267 instructions # 2.35 insn per cycle + 0.966912682 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.105287e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.189516e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.189516e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.986474e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.069956e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.069956e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.797874 sec - 2,345,612,557 cycles # 2.925 GHz - 5,602,963,634 instructions # 2.39 insn per cycle - 0.810229657 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) +TOTAL : 0.846832 sec + 2,295,114,840 cycles # 2.696 GHz + 5,552,751,365 instructions # 2.42 insn per cycle + 0.861502194 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.549899e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.597374e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.597374e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.460942e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.506292e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.506292e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.080161 sec - 2,054,065,161 cycles # 1.894 GHz - 3,334,275,922 instructions # 1.62 insn per cycle - 1.095623221 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) +TOTAL : 1.145570 sec + 2,022,184,795 cycles # 1.758 GHz + 3,286,748,929 instructions # 1.63 insn per cycle + 1.163999883 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index d8dc2c3678..8f1e29c773 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_15:12:19 +DATE: 2024-01-30_05:48:04 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.001518e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.137858e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.137858e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.794241e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.099961e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.099961e+07 ) sec^-1 MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 -TOTAL : 0.455815 sec - 1,948,301,357 cycles # 2.927 GHz - 2,907,177,849 instructions # 1.49 insn per cycle - 0.723196787 seconds time elapsed +TOTAL : 0.464854 sec + 1,913,128,801 cycles # 2.831 GHz + 2,814,269,280 instructions # 1.47 insn per cycle + 0.735191571 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.698909e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.564625e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.564625e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.563056e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.567773e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567773e+07 ) sec^-1 MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 -TOTAL : 0.639136 sec - 2,545,540,064 cycles # 2.932 GHz - 3,908,776,443 instructions # 1.54 insn per cycle - 0.927730276 seconds time elapsed +TOTAL : 0.649992 sec + 2,514,728,119 cycles # 2.840 GHz + 3,857,856,675 instructions # 1.53 insn per cycle + 0.945286461 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,76 +99,76 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.559629e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.572951e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.572951e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.612332e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.626818e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.626818e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.425977 sec - 19,091,912,214 cycles # 2.972 GHz - 58,967,324,889 instructions # 3.09 insn per cycle - 6.430306421 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.299552 sec + 18,207,767,275 cycles # 2.889 GHz + 55,242,943,760 instructions # 3.03 insn per cycle + 6.304483382 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1229) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412998e+00 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.438378e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.588398e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.588398e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.365917e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.522444e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.522444e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.967455 sec - 5,879,551,760 cycles # 2.983 GHz - 16,741,960,213 instructions # 2.85 insn per cycle - 1.971863675 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.986287 sec + 5,717,011,577 cycles # 2.873 GHz + 16,175,954,346 instructions # 2.83 insn per cycle + 1.991587162 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5205) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129864902818952 +Relative difference = 3.469828399449743e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.817320e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.883646e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.883646e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.741687e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.807547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.807547e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.926249 sec - 2,607,056,313 cycles # 2.804 GHz - 6,016,570,943 instructions # 2.31 insn per cycle - 0.930566113 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) +TOTAL : 0.968315 sec + 2,618,792,433 cycles # 2.693 GHz + 6,122,206,815 instructions # 2.34 insn per cycle + 0.973667021 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4878) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,27 +176,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.015882e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.096022e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.096022e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.976057e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.060749e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.060749e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.837462 sec - 2,364,091,843 cycles # 2.810 GHz - 5,638,984,109 instructions # 2.39 insn per cycle - 0.841852545 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) +TOTAL : 0.855993 sec + 2,321,654,642 cycles # 2.699 GHz + 5,589,002,861 instructions # 2.41 insn per cycle + 0.861171520 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4415) (512y: 30) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -204,27 +204,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.519186e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.565700e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.565700e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.455132e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.500155e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.500155e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.105946 sec - 2,075,898,441 cycles # 1.871 GHz - 3,374,696,524 instructions # 1.63 insn per cycle - 1.110329864 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) +TOTAL : 1.154878 sec + 2,044,999,339 cycles # 1.765 GHz + 3,327,504,110 instructions # 1.63 insn per cycle + 1.160035358 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1905) (512y: 28) (512z: 3597) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -232,8 +232,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 69970d8c55..71f99cc0f9 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:35:02 +DATE: 2024-01-30_04:59:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.373914e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.228922e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.325879e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.454028e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.326749e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.426065e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 -TOTAL : 0.443366 sec - 2,003,938,604 cycles # 2.996 GHz - 2,792,184,039 instructions # 1.39 insn per cycle - 0.740353888 seconds time elapsed +TOTAL : 0.451891 sec + 1,884,922,304 cycles # 2.826 GHz + 2,675,153,942 instructions # 1.42 insn per cycle + 0.742708600 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.237434e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.431964e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.514879e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.211971e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.383449e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.465566e+07 ) sec^-1 MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 -TOTAL : 0.491294 sec - 2,181,970,196 cycles # 3.018 GHz - 3,127,779,956 instructions # 1.43 insn per cycle - 0.781703991 seconds time elapsed +TOTAL : 0.498447 sec + 2,066,378,383 cycles # 2.841 GHz + 2,912,189,828 instructions # 1.41 insn per cycle + 0.785425719 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,74 +86,74 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.673738e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.687597e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.687597e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.621420e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.635929e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.635929e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 6.148417 sec - 18,978,547,749 cycles # 3.085 GHz - 58,701,989,638 instructions # 3.09 insn per cycle - 6.154751887 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.272536 sec + 18,133,908,438 cycles # 2.889 GHz + 54,991,536,969 instructions # 3.03 insn per cycle + 6.280002049 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1171) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412986e+00 -Avg ME (F77/C++) = 1.4129858051842916 -Relative difference = 1.3787518662898538e-07 +Avg ME (C++/C++) = 1.412998e+00 +Avg ME (F77/C++) = 1.4129977771372637 +Relative difference = 1.5772332039074602e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.234838e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.399257e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.399257e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.675526e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.845155e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.845155e+04 ) sec^-1 MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 -TOTAL : 1.793566 sec - 5,583,260,034 cycles # 3.106 GHz - 16,511,055,213 instructions # 2.96 insn per cycle - 1.805973554 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.910252 sec + 5,541,476,355 cycles # 2.894 GHz + 16,222,950,904 instructions # 2.93 insn per cycle + 1.926546393 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 5136) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 1.412987e+00 -Avg ME (F77/C++) = 1.4129865669244737 -Relative difference = 3.06496469061158e-07 +Avg ME (C++/C++) = 1.412986e+00 +Avg ME (F77/C++) = 1.4129863487235070 +Relative difference = 2.4679898241023883e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.640420e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.690986e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.690986e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.524928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.573974e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.573974e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 1.018575 sec - 2,975,461,813 cycles # 2.909 GHz - 6,634,551,571 instructions # 2.23 insn per cycle - 1.032313323 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) +TOTAL : 1.096795 sec + 2,981,881,341 cycles # 2.708 GHz + 6,708,240,605 instructions # 2.25 insn per cycle + 1.109848469 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5430) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.779474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.839054e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.839054e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.679205e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.738776e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.738776e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 -TOTAL : 0.940459 sec - 2,751,956,582 cycles # 2.914 GHz - 6,255,845,013 instructions # 2.27 insn per cycle - 0.955633764 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) +TOTAL : 0.997879 sec + 2,711,169,290 cycles # 2.704 GHz + 6,222,713,478 instructions # 2.30 insn per cycle + 1.012945753 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5056) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133161655815059 -Relative difference = 1.1715816267550621e-07 +Avg ME (F77/C++) = 1.4133158486847037 +Relative difference = 1.0706402269051248e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.472370e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.513955e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.513955e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.374736e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.414577e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.414577e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 -TOTAL : 1.135039 sec - 2,225,305,054 cycles # 1.953 GHz - 3,698,319,936 instructions # 1.66 insn per cycle - 1.145678367 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2378) (512y: 29) (512z: 3963) +TOTAL : 1.216016 sec + 2,159,440,418 cycles # 1.769 GHz + 3,642,249,109 instructions # 1.69 insn per cycle + 1.228978695 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2070) (512y: 21) (512z: 3922) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 -Avg ME (F77/C++) = 1.4133164033579249 -Relative difference = 2.85398258307829e-07 +Avg ME (F77/C++) = 1.4133164031689205 +Relative difference = 2.852645271622733e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index ecdaa6bbe5..c3bf1d184f 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:35:27 +DATE: 2024-01-30_04:59:47 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.701296e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.042819e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.057341e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.711100e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.041363e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.056144e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460305 sec - 2,033,483,355 cycles # 3.015 GHz - 2,914,097,696 instructions # 1.43 insn per cycle - 0.748064785 seconds time elapsed +TOTAL : 0.470768 sec + 1,937,905,575 cycles # 2.825 GHz + 2,769,085,725 instructions # 1.43 insn per cycle + 0.764309757 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.081681e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.313999e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.327824e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.077034e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.312199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.326400e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.605547 sec - 2,520,007,236 cycles # 2.997 GHz - 3,773,324,991 instructions # 1.50 insn per cycle - 0.902409459 seconds time elapsed +TOTAL : 0.614129 sec + 2,415,403,751 cycles # 2.830 GHz + 3,662,132,699 instructions # 1.52 insn per cycle + 0.915037914 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.516282e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.528433e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.528433e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.370924e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.382846e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.382846e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.534539 sec - 20,013,949,237 cycles # 3.061 GHz - 60,533,001,946 instructions # 3.02 insn per cycle - 6.541112859 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.936117 sec + 19,978,394,912 cycles # 2.879 GHz + 59,162,561,873 instructions # 2.96 insn per cycle + 6.944191465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1149) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.912613e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.958884e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.958884e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.694585e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.741387e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.741387e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.357235 sec - 10,182,360,341 cycles # 3.029 GHz - 30,384,902,169 instructions # 2.98 insn per cycle - 3.367308670 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.513998 sec + 10,104,341,088 cycles # 2.872 GHz + 29,763,867,436 instructions # 2.95 insn per cycle + 3.532062820 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4873) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.926374e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.010833e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.010833e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.157849e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.336120e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.336120e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.672181 sec - 4,864,081,957 cycles # 2.901 GHz - 10,979,232,419 instructions # 2.26 insn per cycle - 1.684140082 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) +TOTAL : 1.813275 sec + 4,888,809,789 cycles # 2.689 GHz + 11,200,775,616 instructions # 2.29 insn per cycle + 1.831194346 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4581) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.137450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.161103e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.161103e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.059295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.083013e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.083013e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.462186 sec - 4,272,825,074 cycles # 2.914 GHz - 10,248,981,944 instructions # 2.40 insn per cycle - 1.474897364 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) +TOTAL : 1.571072 sec + 4,240,948,322 cycles # 2.691 GHz + 10,146,075,765 instructions # 2.39 insn per cycle + 1.585395140 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4064) (512y: 73) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.677106e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.786537e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.786537e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.157625e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.268151e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.268151e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.158029 sec - 4,199,815,524 cycles # 1.943 GHz - 6,043,516,368 instructions # 1.44 insn per cycle - 2.169397232 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) +TOTAL : 2.315387 sec + 4,011,221,101 cycles # 1.729 GHz + 5,838,969,816 instructions # 1.46 insn per cycle + 2.328222904 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1778) (512y: 97) (512z: 3502) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index f9629437a1..0465a21327 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2023-11-24_14:35:56 +DATE: 2024-01-30_05:00:18 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.718207e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.043711e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.058000e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.666023e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032901e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.046936e+07 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 0.460190 sec - 2,020,498,162 cycles # 3.001 GHz - 2,893,283,708 instructions # 1.43 insn per cycle - 0.740481359 seconds time elapsed +TOTAL : 0.468226 sec + 1,937,873,508 cycles # 2.824 GHz + 2,759,069,461 instructions # 1.42 insn per cycle + 0.754370762 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.069988e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.302141e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.315703e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.070939e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304690e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318717e+07 ) sec^-1 MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 -TOTAL : 0.603031 sec - 2,496,488,169 cycles # 2.975 GHz - 3,745,992,997 instructions # 1.50 insn per cycle - 0.898963383 seconds time elapsed +TOTAL : 0.607643 sec + 2,403,195,178 cycles # 2.827 GHz + 3,555,740,714 instructions # 1.48 insn per cycle + 0.909921855 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.573744e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.586335e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.586335e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.404971e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.417275e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.417275e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 6.388339 sec - 19,817,061,675 cycles # 3.101 GHz - 59,934,412,865 instructions # 3.02 insn per cycle - 6.394993907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.840299 sec + 19,736,673,501 cycles # 2.886 GHz + 58,709,690,472 instructions # 2.97 insn per cycle + 6.847451518 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1026) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.068721e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.116534e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.116534e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.708829e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.755468e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.755468e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 3.254189 sec - 10,074,392,918 cycles # 3.092 GHz - 30,097,970,506 instructions # 2.99 insn per cycle - 3.269644857 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.503073 sec + 10,118,973,746 cycles # 2.885 GHz + 30,158,905,101 instructions # 2.98 insn per cycle + 3.519090284 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4944) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.551013e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.718637e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.718637e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.784663e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.950747e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.950747e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.737428 sec - 5,013,997,017 cycles # 2.879 GHz - 11,483,477,844 instructions # 2.29 insn per cycle - 1.752998506 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) +TOTAL : 1.889159 sec + 5,039,949,395 cycles # 2.661 GHz + 11,663,409,755 instructions # 2.31 insn per cycle + 1.981495827 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4685) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.052618e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.072863e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.072863e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.838137e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.004758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.004758e+05 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 1.578372 sec - 4,590,458,207 cycles # 2.900 GHz - 10,810,507,291 instructions # 2.35 insn per cycle - 1.591783419 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) +TOTAL : 1.689521 sec + 4,555,347,979 cycles # 2.689 GHz + 10,787,640,248 instructions # 2.37 insn per cycle + 1.702819632 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4159) (512y: 233) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.627027e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.731488e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.731488e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.077813e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.181685e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.181685e+04 ) sec^-1 MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 -TOTAL : 2.171750 sec - 4,214,575,835 cycles # 1.937 GHz - 6,273,425,611 instructions # 1.49 insn per cycle - 2.183570227 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) +TOTAL : 2.340568 sec + 4,064,413,524 cycles # 1.733 GHz + 6,073,601,897 instructions # 1.49 insn per cycle + 2.356439472 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1725) (512y: 104) (512z: 3609) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 -Avg ME (F77/C++) = 1.4131213786174055 -Relative difference = 4.3972324717191576e-07 +Avg ME (F77/C++) = 1.4131213600217192 +Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 9b25288b8a..53bd28a5bd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:36:25 +DATE: 2024-01-30_05:00:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.502935e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.531164e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.533426e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.507010e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.536029e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.538733e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519828 sec - 2,267,325,261 cycles # 3.026 GHz - 3,564,664,052 instructions # 1.57 insn per cycle - 0.820257033 seconds time elapsed +TOTAL : 0.531942 sec + 2,193,462,671 cycles # 2.834 GHz + 3,356,973,773 instructions # 1.53 insn per cycle + 0.849346656 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.129197e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.163245e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.164643e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.126743e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.160620e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.162100e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.029980 sec - 10,118,272,870 cycles # 3.068 GHz - 21,084,266,771 instructions # 2.08 insn per cycle - 3.354068412 seconds time elapsed +TOTAL : 3.043092 sec + 9,489,937,514 cycles # 2.875 GHz + 19,463,317,431 instructions # 2.05 insn per cycle + 3.359518634 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.983194e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.984194e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.984194e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.787937e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.788754e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.788754e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.279983 sec - 25,558,009,617 cycles # 3.086 GHz - 78,936,663,909 instructions # 3.09 insn per cycle - 8.286888065 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.184792 sec + 26,445,376,310 cycles # 2.879 GHz + 81,759,262,253 instructions # 3.09 insn per cycle + 9.200099621 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.793332e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.796868e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.796868e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.595033e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598347e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598347e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.332856 sec - 12,910,305,211 cycles # 2.977 GHz - 39,281,355,341 instructions # 3.04 insn per cycle - 4.346559861 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.572055 sec + 12,894,491,420 cycles # 2.818 GHz + 39,242,650,330 instructions # 3.04 insn per cycle + 4.588188651 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.457803e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.475319e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.475319e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.988905e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.005063e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.005063e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.950206 sec - 5,559,743,200 cycles # 2.847 GHz - 13,686,596,932 instructions # 2.46 insn per cycle - 2.032901077 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +TOTAL : 2.062926 sec + 5,559,157,847 cycles # 2.689 GHz + 13,789,744,695 instructions # 2.48 insn per cycle + 2.079268197 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.800481e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.822076e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.822076e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.113130e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.134504e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.134504e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.683777 sec - 4,896,997,214 cycles # 2.903 GHz - 12,341,617,992 instructions # 2.52 insn per cycle - 1.697194330 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +TOTAL : 1.809806 sec + 4,899,980,729 cycles # 2.701 GHz + 12,319,200,932 instructions # 2.51 insn per cycle + 1.824526773 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.669824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.683748e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.683748e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.926484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.938620e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.938620e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.147707 sec - 4,120,375,944 cycles # 1.915 GHz - 6,335,709,132 instructions # 1.54 insn per cycle - 2.163638241 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +TOTAL : 2.377893 sec + 4,078,713,187 cycles # 1.712 GHz + 6,287,612,851 instructions # 1.54 insn per cycle + 2.391138362 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 3e466c9dbd..ba45d149aa 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:13:18 +DATE: 2024-01-30_05:49:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.151810e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.491547e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.491547e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.099881e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.447326e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.447326e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.513888 sec - 2,172,731,633 cycles # 2.929 GHz - 3,459,636,945 instructions # 1.59 insn per cycle - 0.802070844 seconds time elapsed +TOTAL : 0.520843 sec + 2,128,361,154 cycles # 2.833 GHz + 3,379,769,914 instructions # 1.59 insn per cycle + 0.811374229 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.626462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.109948e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.109948e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.602295e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.096469e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.096469e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.316369 sec - 10,702,439,296 cycles # 2.980 GHz - 23,311,698,840 instructions # 2.18 insn per cycle - 3.648833219 seconds time elapsed +TOTAL : 3.329754 sec + 10,358,756,104 cycles # 2.872 GHz + 22,944,085,739 instructions # 2.21 insn per cycle + 3.663397648 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.926732e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.927660e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.927660e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.794771e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.795632e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.795632e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.525962 sec - 25,585,926,017 cycles # 3.000 GHz - 78,942,573,288 instructions # 3.09 insn per cycle - 8.530463803 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.153536 sec + 26,441,951,782 cycles # 2.888 GHz + 81,759,972,796 instructions # 3.09 insn per cycle + 9.158879879 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -127,20 +127,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.644154e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.647807e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.647807e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.577595e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.580993e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.580993e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.514194 sec - 12,913,516,064 cycles # 2.858 GHz - 39,293,095,640 instructions # 3.04 insn per cycle - 4.518821334 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.598491 sec + 12,916,287,273 cycles # 2.806 GHz + 39,254,753,938 instructions # 3.04 insn per cycle + 4.603937867 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -155,20 +155,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.272317e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.289989e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.289989e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.852795e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.869019e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.869019e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.996165 sec - 5,572,431,089 cycles # 2.786 GHz - 13,696,683,094 instructions # 2.46 insn per cycle - 2.000824809 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +TOTAL : 2.103011 sec + 5,568,678,671 cycles # 2.642 GHz + 13,799,771,926 instructions # 2.48 insn per cycle + 2.108561686 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,27 +176,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.416662e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.438491e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.438491e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.035305e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.056800e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.056800e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.755183 sec - 4,929,995,053 cycles # 2.803 GHz - 12,351,698,776 instructions # 2.51 insn per cycle - 1.759689895 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +TOTAL : 1.829572 sec + 4,921,598,332 cycles # 2.684 GHz + 12,328,469,851 instructions # 2.50 insn per cycle + 1.835230648 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -204,27 +204,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.299463e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.313710e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.313710e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.926825e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.939647e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.939647e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.260606 sec - 4,129,998,329 cycles # 1.824 GHz - 6,345,886,658 instructions # 1.54 insn per cycle - 2.265252390 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +TOTAL : 2.382359 sec + 4,075,002,441 cycles # 1.707 GHz + 6,297,411,526 instructions # 1.55 insn per cycle + 2.387952463 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -232,8 +232,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index 22d785bafe..2624aa384f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:25:10 +DATE: 2024-01-30_06:01:23 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.505435e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.534142e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.536887e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.497090e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.524372e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.526818e+05 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.512729 sec - 2,187,113,796 cycles # 2.953 GHz - 3,321,864,960 instructions # 1.52 insn per cycle - 0.803004683 seconds time elapsed +TOTAL : 0.512897 sec + 2,098,983,374 cycles # 2.834 GHz + 3,277,353,449 instructions # 1.56 insn per cycle + 0.803360908 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.143311e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.178194e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.179712e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.141054e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.174803e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176256e+05 ) sec^-1 MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 -TOTAL : 3.127476 sec - 10,173,806,494 cycles # 3.006 GHz - 22,940,336,221 instructions # 2.25 insn per cycle - 3.440988376 seconds time elapsed +TOTAL : 3.134385 sec + 9,742,325,189 cycles # 2.872 GHz + 21,219,396,735 instructions # 2.18 insn per cycle + 3.451782991 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.944681e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.945656e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.945656e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.789322e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.790133e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.790133e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 8.444658 sec - 25,580,623,755 cycles # 3.028 GHz - 78,936,122,811 instructions # 3.09 insn per cycle - 8.449050059 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.178912 sec + 26,467,085,384 cycles # 2.885 GHz + 81,758,395,147 instructions # 3.09 insn per cycle + 9.184185479 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.654154e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.657693e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.657693e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.580434e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.583873e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.583873e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.499225 sec - 12,919,732,335 cycles # 2.870 GHz - 39,280,003,953 instructions # 3.04 insn per cycle - 4.503503119 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.592041 sec + 12,908,303,532 cycles # 2.809 GHz + 39,241,301,392 instructions # 3.04 insn per cycle + 4.597199751 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.335985e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.352449e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.352449e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.006274e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.022952e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.022952e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.978535 sec - 5,565,762,005 cycles # 2.808 GHz - 13,684,790,936 instructions # 2.46 insn per cycle - 1.982835811 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +TOTAL : 2.060575 sec + 5,561,277,799 cycles # 2.694 GHz + 13,787,529,346 instructions # 2.48 insn per cycle + 2.065507699 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.460859e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.482779e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.482779e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.108001e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.130506e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.130506e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.744747 sec - 4,901,917,953 cycles # 2.804 GHz - 12,339,116,177 instructions # 2.52 insn per cycle - 1.749045721 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +TOTAL : 1.812494 sec + 4,903,037,786 cycles # 2.699 GHz + 12,315,866,756 instructions # 2.51 insn per cycle + 1.817504411 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.321634e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.334985e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.334985e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.888313e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.900941e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.900941e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 2.251368 sec - 4,121,273,998 cycles # 1.828 GHz - 6,332,363,558 instructions # 1.54 insn per cycle - 2.255694764 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +TOTAL : 2.393126 sec + 4,056,497,728 cycles # 1.692 GHz + 6,284,230,028 instructions # 1.55 insn per cycle + 2.398190383 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 0edaf6e67f..711141aac6 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:21:49 +DATE: 2024-01-30_05:57:54 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.504189e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.533338e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.535687e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.493459e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.521487e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.524248e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.507565 sec - 2,140,644,436 cycles # 2.903 GHz - 3,294,278,423 instructions # 1.54 insn per cycle - 0.797247733 seconds time elapsed +TOTAL : 0.509745 sec + 2,103,800,649 cycles # 2.836 GHz + 3,325,789,020 instructions # 1.58 insn per cycle + 0.801087014 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.134584e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.168777e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.170193e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.145753e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.180203e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.181702e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.073697 sec - 9,988,823,734 cycles # 2.997 GHz - 22,688,491,791 instructions # 2.27 insn per cycle - 3.391117560 seconds time elapsed +TOTAL : 3.079125 sec + 9,600,126,952 cycles # 2.878 GHz + 21,681,876,510 instructions # 2.26 insn per cycle + 3.393235673 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.920556e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.921467e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.921467e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.797758e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.798583e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.798583e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.549047 sec - 25,566,043,933 cycles # 2.990 GHz - 78,938,822,238 instructions # 3.09 insn per cycle - 8.553277802 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.135643 sec + 26,454,582,305 cycles # 2.896 GHz + 81,754,058,548 instructions # 3.09 insn per cycle + 9.140745485 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.702947e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.706522e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.706522e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.597207e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.600539e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.600539e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.438616 sec - 12,919,298,910 cycles # 2.909 GHz - 39,279,556,753 instructions # 3.04 insn per cycle - 4.442989895 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.569387 sec + 12,892,653,048 cycles # 2.819 GHz + 39,241,760,724 instructions # 3.04 insn per cycle + 4.574378716 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.327524e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.345264e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.345264e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.978625e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.995447e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.995447e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.979008 sec - 5,559,052,885 cycles # 2.804 GHz - 13,685,718,134 instructions # 2.46 insn per cycle - 1.983236201 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +TOTAL : 2.065609 sec + 5,559,302,417 cycles # 2.687 GHz + 13,789,202,442 instructions # 2.48 insn per cycle + 2.071000161 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.547712e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.570537e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.570537e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.097696e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.119952e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.119952e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.726945 sec - 4,895,458,327 cycles # 2.829 GHz - 12,340,699,922 instructions # 2.52 insn per cycle - 1.731178551 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +TOTAL : 1.812868 sec + 4,896,837,509 cycles # 2.695 GHz + 12,317,770,581 instructions # 2.52 insn per cycle + 1.818257681 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.393984e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.407662e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.407662e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.967466e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.979997e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.979997e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.227409 sec - 4,117,035,910 cycles # 1.846 GHz - 6,334,322,153 instructions # 1.54 insn per cycle - 2.231712529 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +TOTAL : 2.364567 sec + 4,060,623,360 cycles # 1.715 GHz + 6,286,167,500 instructions # 1.55 insn per cycle + 2.369629620 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index 3fd3545f79..de6151d7b3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:18:31 +DATE: 2024-01-30_05:54:28 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.205483e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.519039e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.521349e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.181803e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.496640e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.499302e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.513661 sec - 2,155,662,048 cycles # 2.911 GHz - 3,403,110,868 instructions # 1.58 insn per cycle - 0.802490530 seconds time elapsed +TOTAL : 0.513215 sec + 2,110,243,378 cycles # 2.841 GHz + 3,364,158,559 instructions # 1.59 insn per cycle + 0.803846009 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,17 +68,17 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.731558e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.171074e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.172534e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.724341e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.176977e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.178501e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.204951 sec - 10,303,132,790 cycles # 2.977 GHz - 23,518,550,603 instructions # 2.28 insn per cycle - 3.516665455 seconds time elapsed +TOTAL : 3.211617 sec + 9,930,008,722 cycles # 2.863 GHz + 21,629,593,771 instructions # 2.18 insn per cycle + 3.536993543 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -89,20 +89,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.916291e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.917249e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.917249e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.795958e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.796814e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.796814e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.568789 sec - 25,583,500,760 cycles # 2.987 GHz - 78,939,465,924 instructions # 3.09 insn per cycle - 8.573085981 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.141774 sec + 26,442,082,623 cycles # 2.892 GHz + 81,755,899,902 instructions # 3.09 insn per cycle + 9.146895276 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6614) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -116,20 +116,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.689818e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.693400e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.693400e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.584252e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.587667e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.587667e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.453935 sec - 12,901,075,488 cycles # 2.894 GHz - 39,279,441,065 instructions # 3.04 insn per cycle - 4.458436593 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.585664 sec + 12,903,354,074 cycles # 2.812 GHz + 39,243,037,589 instructions # 3.04 insn per cycle + 4.591083081 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12814) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -143,20 +143,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.313071e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.330635e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.330635e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.993074e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.009513e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.009513e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.982693 sec - 5,557,103,226 cycles # 2.798 GHz - 13,686,016,474 instructions # 2.46 insn per cycle - 1.987071584 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) +TOTAL : 2.061753 sec + 5,556,410,491 cycles # 2.690 GHz + 13,788,754,708 instructions # 2.48 insn per cycle + 2.066810636 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11059) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -164,26 +164,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.334052e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.355862e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.355862e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.089775e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.111272e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.111272e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.766820 sec - 4,905,253,884 cycles # 2.771 GHz - 12,341,018,898 instructions # 2.52 insn per cycle - 1.771164761 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) +TOTAL : 1.814284 sec + 4,898,229,262 cycles # 2.694 GHz + 12,317,871,193 instructions # 2.51 insn per cycle + 1.819291757 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9762) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -191,26 +191,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.271568e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.284665e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.284665e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.893591e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.906421e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.906421e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.265152 sec - 4,121,020,497 cycles # 1.817 GHz - 6,334,252,603 instructions # 1.54 insn per cycle - 2.269401421 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) +TOTAL : 2.392306 sec + 4,056,818,337 cycles # 1.695 GHz + 6,287,135,022 instructions # 1.55 insn per cycle + 2.397424437 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1516) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -218,8 +218,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index 98140f2185..ce8b9bfd9b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:37:01 +DATE: 2024-01-30_05:01:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.470716e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.498661e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.501013e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.464704e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.493618e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.496312e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.519808 sec - 2,262,549,095 cycles # 3.015 GHz - 3,507,447,767 instructions # 1.55 insn per cycle - 0.822215819 seconds time elapsed +TOTAL : 0.530379 sec + 2,191,459,528 cycles # 2.836 GHz + 3,378,194,635 instructions # 1.54 insn per cycle + 0.862349447 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.130084e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164129e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.165510e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.136041e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.170363e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.171805e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.030552 sec - 10,073,054,045 cycles # 3.064 GHz - 22,850,649,130 instructions # 2.27 insn per cycle - 3.344535702 seconds time elapsed +TOTAL : 3.033894 sec + 9,468,330,012 cycles # 2.874 GHz + 21,262,061,450 instructions # 2.25 insn per cycle + 3.350179318 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.990080e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.991047e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.991047e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.798336e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.799213e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.799213e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.251874 sec - 25,580,916,000 cycles # 3.099 GHz - 78,707,540,592 instructions # 3.08 insn per cycle - 8.258581252 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.130694 sec + 26,439,863,153 cycles # 2.895 GHz + 81,781,637,155 instructions # 3.09 insn per cycle + 9.163718345 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6589) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.671887e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.674997e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.674997e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.559639e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.562995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.562995e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.475213 sec - 12,954,555,110 cycles # 2.897 GHz - 39,230,579,465 instructions # 3.03 insn per cycle - 4.487081742 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.616847 sec + 12,919,257,236 cycles # 2.796 GHz + 39,249,733,665 instructions # 3.04 insn per cycle + 4.636578065 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12771) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.529282e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.546223e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.546223e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.030089e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.046612e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.046612e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.932123 sec - 5,603,928,357 cycles # 2.895 GHz - 13,800,807,908 instructions # 2.46 insn per cycle - 1.945155802 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) +TOTAL : 2.052281 sec + 5,556,604,473 cycles # 2.701 GHz + 13,805,088,947 instructions # 2.48 insn per cycle + 2.071717259 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11048) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.635530e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.657625e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.657625e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.135265e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.157006e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.157006e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.712282 sec - 4,956,507,680 cycles # 2.889 GHz - 12,466,691,438 instructions # 2.52 insn per cycle - 1.725471807 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) +TOTAL : 1.804981 sec + 4,885,090,375 cycles # 2.700 GHz + 12,330,030,988 instructions # 2.52 insn per cycle + 1.821790981 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9736) (512y: 94) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.639925e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.653575e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.653575e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.917661e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.930225e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.930225e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.155986 sec - 4,118,224,744 cycles # 1.907 GHz - 6,458,752,156 instructions # 1.57 insn per cycle - 2.169032093 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) +TOTAL : 2.381269 sec + 4,053,625,505 cycles # 1.699 GHz + 6,293,972,632 instructions # 1.55 insn per cycle + 2.398074513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1497) (512y: 94) (512z: 9019) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198157320E-004 -Relative difference = 2.837296634927675e-07 +Avg ME (F77/C++) = 6.6266731198157309E-004 +Relative difference = 2.837296636563793e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 9bedb53e70..466f11943e 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:02:17 +DATE: 2024-01-30_05:37:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.232352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.258036e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.260266e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.224805e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.249067e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.252384e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.533076 sec - 2,264,463,020 cycles # 2.941 GHz - 3,538,774,343 instructions # 1.56 insn per cycle - 0.826602256 seconds time elapsed +TOTAL : 0.539977 sec + 2,169,108,553 cycles # 2.827 GHz + 3,309,870,321 instructions # 1.53 insn per cycle + 0.827427226 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.769231e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.796782e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.797908e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.771192e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.799798e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.801021e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.296720 sec - 10,445,857,222 cycles # 2.932 GHz - 21,607,313,168 instructions # 2.07 insn per cycle - 3.618781692 seconds time elapsed +TOTAL : 3.309705 sec + 10,258,793,873 cycles # 2.876 GHz + 23,623,503,831 instructions # 2.30 insn per cycle + 3.624842760 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.334345e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.334819e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.334819e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.186471e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.186937e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.186937e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 37.848381 sec - 113,421,106,616 cycles # 2.997 GHz - 144,959,874,698 instructions # 1.28 insn per cycle - 37.852852480 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21301) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 39.185516 sec + 112,945,518,025 cycles # 2.882 GHz + 141,519,786,794 instructions # 1.25 insn per cycle + 39.190901211 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21365) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.164523e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.167012e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.167012e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.072790e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.075243e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.075243e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.191844 sec - 14,732,312,446 cycles # 2.836 GHz - 37,575,149,913 instructions # 2.55 insn per cycle - 5.196266852 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.347436 sec + 14,950,247,924 cycles # 2.794 GHz + 37,533,141,644 instructions # 2.51 insn per cycle + 5.352716029 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68052) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141220E-004 +Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.486864e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.501204e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.501204e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.349404e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.363561e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.363561e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.200274 sec - 6,167,482,001 cycles # 2.799 GHz - 13,061,737,270 instructions # 2.12 insn per cycle - 2.204748277 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) +TOTAL : 2.242056 sec + 6,032,020,393 cycles # 2.685 GHz + 12,947,712,227 instructions # 2.15 insn per cycle + 2.247452761 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:46593) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.239505e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.261064e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.261064e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.895381e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.916043e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.916043e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.784853 sec - 5,062,751,449 cycles # 2.830 GHz - 11,440,329,026 instructions # 2.26 insn per cycle - 1.789370916 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) +TOTAL : 1.857617 sec + 4,999,907,297 cycles # 2.689 GHz + 11,364,404,504 instructions # 2.27 insn per cycle + 1.863061758 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40158) (512y: 279) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.593982e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.608272e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.608272e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.220172e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.234094e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.234094e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.169483 sec - 4,001,360,531 cycles # 1.841 GHz - 5,942,704,084 instructions # 1.49 insn per cycle - 2.173913867 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) +TOTAL : 2.282224 sec + 3,899,980,695 cycles # 1.706 GHz + 5,854,430,419 instructions # 1.50 insn per cycle + 2.287473513 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2112) (512y: 142) (512z:39211) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 3babb1df02..5156a1b6a3 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:03:26 +DATE: 2024-01-30_05:38:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.228708e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.253927e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.255914e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.248555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273233e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.276174e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.533504 sec - 2,263,207,731 cycles # 2.940 GHz - 3,466,881,858 instructions # 1.53 insn per cycle - 0.826915946 seconds time elapsed +TOTAL : 0.534739 sec + 2,168,342,582 cycles # 2.838 GHz + 3,393,710,794 instructions # 1.57 insn per cycle + 0.822006178 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.789640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.817455e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.818645e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.787191e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.816239e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.817488e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.275326 sec - 10,593,520,447 cycles # 2.998 GHz - 22,594,076,221 instructions # 2.13 insn per cycle - 3.592953181 seconds time elapsed +TOTAL : 3.282278 sec + 10,172,932,501 cycles # 2.876 GHz + 20,641,658,708 instructions # 2.03 insn per cycle + 3.596374566 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.301382e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.301861e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.301861e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.152053e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.152498e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.152498e+02 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 38.148296 sec - 114,295,626,783 cycles # 2.997 GHz - 145,697,398,014 instructions # 1.27 insn per cycle - 38.152633004 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:22559) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 39.509903 sec + 113,989,864,763 cycles # 2.886 GHz + 141,709,117,860 instructions # 1.24 insn per cycle + 39.515181315 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21615) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198140450E-004 -Relative difference = 2.83729918072716e-07 +Avg ME (F77/C++) = 6.6266731198140461E-004 +Relative difference = 2.8372991790910424e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.078026e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.080354e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.080354e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.077703e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.080226e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.080226e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 5.337092 sec - 15,158,837,201 cycles # 2.838 GHz - 37,762,768,502 instructions # 2.49 insn per cycle - 5.341471006 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 5.341972 sec + 14,900,472,017 cycles # 2.788 GHz + 37,594,155,695 instructions # 2.52 insn per cycle + 5.347186768 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68056) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198141209E-004 -Relative difference = 2.8372990661989057e-07 +Avg ME (F77/C++) = 6.6266731198141220E-004 +Relative difference = 2.837299064562788e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.680771e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.695420e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.695420e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.479123e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.493428e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.493428e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.144872 sec - 6,018,410,000 cycles # 2.801 GHz - 12,896,129,377 instructions # 2.14 insn per cycle - 2.149367936 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) +TOTAL : 2.203003 sec + 5,937,038,542 cycles # 2.690 GHz + 12,831,821,287 instructions # 2.16 insn per cycle + 2.208347742 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:45663) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.099517e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.119694e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.119694e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.959391e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.980227e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.980227e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.811911 sec - 5,092,083,515 cycles # 2.805 GHz - 11,446,810,123 instructions # 2.25 insn per cycle - 1.816340568 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) +TOTAL : 1.840604 sec + 4,989,362,539 cycles # 2.704 GHz + 11,359,801,014 instructions # 2.28 insn per cycle + 1.846082122 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:39855) (512y: 212) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266731198156789E-004 -Relative difference = 2.837296715097453e-07 +Avg ME (F77/C++) = 6.6266731198156778E-004 +Relative difference = 2.837296716733571e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.715726e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.730963e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.730963e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.264695e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.278525e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.278525e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.135403 sec - 3,949,503,348 cycles # 1.847 GHz - 5,897,146,057 instructions # 1.49 insn per cycle - 2.139763572 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) +TOTAL : 2.267781 sec + 3,893,427,498 cycles # 1.714 GHz + 5,843,815,532 instructions # 1.50 insn per cycle + 2.273034135 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1687) (512y: 116) (512z:38946) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index 9acfc6188d..aecab864cd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:37:38 +DATE: 2024-01-30_05:02:06 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.349743e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.400970e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.406599e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.329622e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.381296e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.387810e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.479034 sec - 2,079,658,422 cycles # 2.994 GHz - 3,036,293,369 instructions # 1.46 insn per cycle - 0.774778080 seconds time elapsed +TOTAL : 0.486367 sec + 1,996,254,093 cycles # 2.831 GHz + 2,951,017,935 instructions # 1.48 insn per cycle + 0.792595596 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.518158e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.591813e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.595087e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.619469e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.695026e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.698446e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.721697 sec - 5,967,810,947 cycles # 3.055 GHz - 12,572,582,180 instructions # 2.11 insn per cycle - 2.009764828 seconds time elapsed +TOTAL : 1.718352 sec + 5,604,348,056 cycles # 2.870 GHz + 11,484,891,091 instructions # 2.05 insn per cycle + 2.010002941 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,128 +86,128 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.064583e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.065690e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.065690e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.963446e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.964435e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.964435e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.952623 sec - 24,634,175,599 cycles # 3.097 GHz - 78,128,580,676 instructions # 3.17 insn per cycle - 7.959267887 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.362127 sec + 24,202,873,915 cycles # 2.893 GHz + 75,878,244,924 instructions # 3.14 insn per cycle + 8.372784572 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.270538e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.283405e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.283405e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.122204e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.135618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.135618e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.264433 sec - 6,463,017,562 cycles # 2.850 GHz - 20,121,780,291 instructions # 3.11 insn per cycle - 2.276060029 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.311249 sec + 6,498,315,380 cycles # 2.806 GHz + 20,115,878,445 instructions # 3.10 insn per cycle + 2.327706318 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.637995e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.644543e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.644543e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.585863e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.592266e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.592266e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.009271 sec - 2,836,657,749 cycles # 2.799 GHz - 6,989,190,074 instructions # 2.46 insn per cycle - 1.021190194 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +TOTAL : 1.042848 sec + 2,820,748,390 cycles # 2.693 GHz + 7,038,277,049 instructions # 2.50 insn per cycle + 1.060611053 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.933245e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.942230e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.942230e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.805764e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814413e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814413e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.856055 sec - 2,489,722,611 cycles # 2.894 GHz - 6,296,795,301 instructions # 2.53 insn per cycle - 0.871292854 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +TOTAL : 0.916917 sec + 2,479,527,909 cycles # 2.691 GHz + 6,280,728,930 instructions # 2.53 insn per cycle + 0.937569165 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.556155e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.561846e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.561846e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.395801e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.400853e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.400853e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.062224 sec - 2,048,930,683 cycles # 1.922 GHz - 3,267,038,899 instructions # 1.59 insn per cycle - 1.073737564 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +TOTAL : 1.183787 sec + 2,037,112,677 cycles # 1.714 GHz + 3,249,000,234 instructions # 1.59 insn per cycle + 1.203517458 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index fa5a863bc1..cfd5bd9f60 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:13:56 +DATE: 2024-01-30_05:49:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.632667e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.322906e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.322906e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.575134e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.304295e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.304295e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.467438 sec - 2,006,072,261 cycles # 2.897 GHz - 2,979,194,260 instructions # 1.49 insn per cycle - 0.749996630 seconds time elapsed +TOTAL : 0.470892 sec + 1,938,590,562 cycles # 2.832 GHz + 2,932,139,577 instructions # 1.51 insn per cycle + 0.742517096 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.176934e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.461971e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.461971e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.189558e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.483327e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.483327e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.909592 sec - 6,340,042,952 cycles # 2.952 GHz - 13,540,665,643 instructions # 2.14 insn per cycle - 2.204543754 seconds time elapsed +TOTAL : 1.911946 sec + 6,179,624,048 cycles # 2.874 GHz + 12,701,880,125 instructions # 2.06 insn per cycle + 2.209063416 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,132 +99,132 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.018583e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.019594e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.019594e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.966573e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.967552e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.967552e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.135657 sec - 24,647,759,447 cycles # 3.029 GHz - 78,134,979,688 instructions # 3.17 insn per cycle - 8.140016435 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.353267 sec + 24,210,307,332 cycles # 2.898 GHz + 75,882,231,103 instructions # 3.13 insn per cycle + 8.358202878 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.102824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.116231e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.116231e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.010932e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.023878e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.023878e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.319512 sec - 6,477,527,484 cycles # 2.794 GHz - 20,133,025,469 instructions # 3.11 insn per cycle - 2.323961953 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.350780 sec + 6,507,988,967 cycles # 2.764 GHz + 20,124,211,431 instructions # 3.09 insn per cycle + 2.355993372 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.662031e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.669314e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.669314e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.585110e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.591932e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.591932e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.997479 sec - 2,842,471,628 cycles # 2.839 GHz - 6,997,876,677 instructions # 2.46 insn per cycle - 1.001720557 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +TOTAL : 1.046222 sec + 2,830,060,229 cycles # 2.694 GHz + 7,047,238,365 instructions # 2.49 insn per cycle + 1.051506977 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.899662e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.908650e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.908650e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.805765e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.814390e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.814390e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.874017 sec - 2,497,188,263 cycles # 2.845 GHz - 6,305,318,806 instructions # 2.52 insn per cycle - 0.878481296 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +TOTAL : 0.919776 sec + 2,488,595,721 cycles # 2.693 GHz + 6,289,461,030 instructions # 2.53 insn per cycle + 0.925186931 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.510965e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.516631e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.516631e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.390787e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.395884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.395884e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.096117 sec - 2,057,326,282 cycles # 1.871 GHz - 3,276,156,329 instructions # 1.59 insn per cycle - 1.100460049 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +TOTAL : 1.191044 sec + 2,045,888,825 cycles # 1.712 GHz + 3,258,286,239 instructions # 1.59 insn per cycle + 1.196330024 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -232,8 +232,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index cde090527d..18818d76f2 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:25:47 +DATE: 2024-01-30_06:02:02 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.341986e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.393716e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.399601e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.319163e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.372298e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.378244e+05 ) sec^-1 MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 -TOTAL : 0.465286 sec - 1,996,862,139 cycles # 2.938 GHz - 2,998,262,282 instructions # 1.50 insn per cycle - 0.737241303 seconds time elapsed +TOTAL : 0.470285 sec + 1,953,187,910 cycles # 2.826 GHz + 2,879,626,230 instructions # 1.47 insn per cycle + 0.750725256 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.562727e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.635286e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.638678e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.571852e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.645768e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.649137e+05 ) sec^-1 MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 -TOTAL : 1.800198 sec - 6,027,791,673 cycles # 2.972 GHz - 12,131,730,184 instructions # 2.01 insn per cycle - 2.091221198 seconds time elapsed +TOTAL : 1.807519 sec + 5,850,952,354 cycles # 2.861 GHz + 11,909,032,858 instructions # 2.04 insn per cycle + 2.113707665 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,128 +86,128 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.988613e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.989604e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.989604e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 8.257430 sec - 24,655,829,370 cycles # 2.985 GHz - 78,128,720,285 instructions # 3.17 insn per cycle - 8.261783085 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.964186e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.965186e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.965186e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 8.362848 sec + 24,219,340,843 cycles # 2.896 GHz + 75,878,803,024 instructions # 3.13 insn per cycle + 8.367752014 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.090825e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.103638e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.103638e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 2.321920 sec - 6,463,785,266 cycles # 2.780 GHz - 20,119,008,709 instructions # 3.11 insn per cycle - 2.326112409 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.106063e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.119817e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.119817e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.208458e-01 +- 3.253446e-01 ) GeV^-4 +TOTAL : 2.317534 sec + 6,502,161,706 cycles # 2.801 GHz + 20,113,148,136 instructions # 3.09 insn per cycle + 2.322610994 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.643399e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.650724e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.650724e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 1.007824 sec - 2,846,106,580 cycles # 2.814 GHz - 6,988,034,221 instructions # 2.46 insn per cycle - 1.012119516 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.586948e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.593562e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.593562e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 +TOTAL : 1.043186 sec + 2,822,730,977 cycles # 2.696 GHz + 7,035,059,102 instructions # 2.49 insn per cycle + 1.048122577 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.842696e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.851591e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.851591e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.900263 sec - 2,494,778,530 cycles # 2.760 GHz - 6,293,767,609 instructions # 2.52 insn per cycle - 0.904816892 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.807119e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.816011e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.816011e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.214979e-01 +- 3.255522e-01 ) GeV^-4 +TOTAL : 0.917444 sec + 2,481,419,746 cycles # 2.693 GHz + 6,275,834,953 instructions # 2.53 insn per cycle + 0.922842065 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.422519e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.427987e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.427987e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.399447e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.404609e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.404609e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 1.162978 sec - 2,054,015,790 cycles # 1.761 GHz - 3,264,160,600 instructions # 1.59 insn per cycle - 1.167204725 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +TOTAL : 1.182503 sec + 2,042,245,375 cycles # 1.722 GHz + 3,246,419,225 instructions # 1.59 insn per cycle + 1.187753193 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 3a3eb3caf7..e0bdb664e1 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:22:26 +DATE: 2024-01-30_05:58:32 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.336723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.390423e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.395725e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.316613e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.368482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.375052e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.463745 sec - 1,978,530,916 cycles # 2.937 GHz - 2,900,092,077 instructions # 1.47 insn per cycle - 0.732414298 seconds time elapsed +TOTAL : 0.466730 sec + 1,919,349,851 cycles # 2.829 GHz + 2,893,848,641 instructions # 1.51 insn per cycle + 0.736177730 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.551039e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.623038e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.626383e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.573214e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.646713e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.650131e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.755684 sec - 5,914,588,617 cycles # 2.977 GHz - 12,946,533,243 instructions # 2.19 insn per cycle - 2.043185741 seconds time elapsed +TOTAL : 1.756106 sec + 5,695,142,145 cycles # 2.868 GHz + 11,326,470,226 instructions # 1.99 insn per cycle + 2.046387591 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,128 +86,128 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.015505e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.016502e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.016502e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.965319e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.966317e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.966317e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.145261 sec - 24,654,200,089 cycles # 3.026 GHz - 78,126,847,348 instructions # 3.17 insn per cycle - 8.149417178 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.353416 sec + 24,206,918,909 cycles # 2.897 GHz + 75,878,282,077 instructions # 3.13 insn per cycle + 8.358253425 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.056381e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.069788e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.069788e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.994720e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007761e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.007761e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.332327 sec - 6,459,567,003 cycles # 2.766 GHz - 20,120,739,754 instructions # 3.11 insn per cycle - 2.336953548 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.353332 sec + 6,524,875,303 cycles # 2.768 GHz + 20,114,868,262 instructions # 3.08 insn per cycle + 2.358279130 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.634359e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.641184e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.641184e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.578556e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.585147e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.585147e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.011258 sec - 2,838,489,472 cycles # 2.797 GHz - 6,988,461,226 instructions # 2.46 insn per cycle - 1.015381469 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +TOTAL : 1.047733 sec + 2,820,818,870 cycles # 2.682 GHz + 7,037,506,961 instructions # 2.49 insn per cycle + 1.053002937 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.722994e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.730625e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.730625e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.765542e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.773827e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.773827e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.960124 sec - 2,489,397,951 cycles # 2.584 GHz - 6,296,004,783 instructions # 2.53 insn per cycle - 0.964386434 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +TOTAL : 0.937560 sec + 2,478,872,591 cycles # 2.633 GHz + 6,279,446,291 instructions # 2.53 insn per cycle + 0.942558881 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.483765e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.489132e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.489132e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.394421e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.399630e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.399630e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.113558 sec - 2,047,316,573 cycles # 1.833 GHz - 3,265,815,382 instructions # 1.60 insn per cycle - 1.117668257 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +TOTAL : 1.184942 sec + 2,037,351,256 cycles # 1.714 GHz + 3,247,924,134 instructions # 1.59 insn per cycle + 1.189828303 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 474faabf1c..d4941d3986 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -41,24 +41,24 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:19:08 +DATE: 2024-01-30_05:55:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.719331e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.371583e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.377724e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.730552e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.395791e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.401561e+05 ) sec^-1 MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 -TOTAL : 0.465990 sec - 1,978,008,980 cycles # 2.919 GHz - 2,922,355,478 instructions # 1.48 insn per cycle - 0.734927742 seconds time elapsed +TOTAL : 0.472770 sec + 1,942,039,449 cycles # 2.839 GHz + 2,914,569,721 instructions # 1.50 insn per cycle + 0.744083634 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost @@ -68,17 +68,17 @@ WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.466647e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.638185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.641583e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.426812e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.621213e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.624728e+05 ) sec^-1 MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 -TOTAL : 1.827648 sec - 6,079,488,382 cycles # 2.960 GHz - 11,657,968,792 instructions # 1.92 insn per cycle - 2.117399258 seconds time elapsed +TOTAL : 1.841459 sec + 5,951,272,102 cycles # 2.874 GHz + 12,317,260,326 instructions # 2.07 insn per cycle + 2.133121829 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -89,128 +89,128 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.995150e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.996157e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.996157e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.960613e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.961562e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.961562e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 8.229536 sec - 24,636,759,314 cycles # 2.993 GHz - 78,130,437,860 instructions # 3.17 insn per cycle - 8.233776212 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.374119 sec + 24,216,955,817 cycles # 2.891 GHz + 75,878,033,044 instructions # 3.13 insn per cycle + 8.378947710 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863266294753E-004 -Relative difference = 4.92840687132121e-08 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274870439686495E-004 +Relative difference = 6.634286759220428e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.265246e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.278746e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.278746e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.136107e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.149132e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.149132e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.265483 sec - 6,468,059,761 cycles # 2.851 GHz - 20,120,655,203 instructions # 3.11 insn per cycle - 2.269729053 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.306790 sec + 6,504,696,579 cycles # 2.815 GHz + 20,114,676,918 instructions # 3.09 insn per cycle + 2.311724672 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13237) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861460025036E-004 -Relative difference = 2.2029847170826283e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.626140e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.632839e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.632839e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.585387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.592052e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.592052e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.016729 sec - 2,837,492,476 cycles # 2.781 GHz - 6,988,268,106 instructions # 2.46 insn per cycle - 1.020920647 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) +TOTAL : 1.043340 sec + 2,821,286,489 cycles # 2.694 GHz + 7,037,435,358 instructions # 2.49 insn per cycle + 1.048505999 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11604) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.871366e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.880395e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.880395e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.743919e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.751789e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.751789e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.884395 sec - 2,487,612,796 cycles # 2.801 GHz - 6,295,453,513 instructions # 2.53 insn per cycle - 0.888549870 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) +TOTAL : 0.949190 sec + 2,568,265,414 cycles # 2.694 GHz + 6,279,620,229 instructions # 2.45 insn per cycle + 0.954345697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10320) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271938174574524E-004 -Relative difference = 2.7544470208782633e-08 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.476534e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.482059e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.482059e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.404393e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.409463e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.409463e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.119059 sec - 2,048,523,522 cycles # 1.825 GHz - 3,265,858,129 instructions # 1.59 insn per cycle - 1.123246679 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) +TOTAL : 1.176805 sec + 2,037,562,738 cycles # 1.726 GHz + 3,247,895,210 instructions # 1.59 insn per cycle + 1.182054069 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2165) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -218,8 +218,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952779373838E-004 -Relative difference = 4.193891735414155e-08 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 8db08c40cb..391ab3d24f 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:38:07 +DATE: 2024-01-30_05:02:36 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.342222e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.395573e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.401305e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.280133e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.331305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.337921e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.475377 sec - 2,071,036,372 cycles # 2.995 GHz - 3,032,826,911 instructions # 1.46 insn per cycle - 0.762514427 seconds time elapsed +TOTAL : 0.487324 sec + 1,984,658,948 cycles # 2.819 GHz + 2,919,152,547 instructions # 1.47 insn per cycle + 0.799038148 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 8.503453e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.577079e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.580179e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.572518e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.647175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.650566e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.721929 sec - 5,978,281,541 cycles # 3.061 GHz - 12,363,091,331 instructions # 2.07 insn per cycle - 2.009961043 seconds time elapsed +TOTAL : 1.731714 sec + 5,664,416,860 cycles # 2.869 GHz + 11,423,818,247 instructions # 2.02 insn per cycle + 2.033192051 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.071210e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.072246e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.072246e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.928583e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.929543e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.929543e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 -TOTAL : 7.926211 sec - 24,590,863,046 cycles # 3.102 GHz - 77,854,642,024 instructions # 3.17 insn per cycle - 7.932789655 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.512607 sec + 24,191,141,745 cycles # 2.843 GHz + 75,807,282,467 instructions # 3.13 insn per cycle + 8.524714483 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3848) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,107 +107,107 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627487e-04 -Avg ME (F77/C++) = 6.6274866268634797E-004 -Relative difference = 5.630135835748959e-08 +Avg ME (F77/C++) = 6.6274870430095556E-004 +Relative difference = 6.489572191632735e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.597544e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.611358e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.611358e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.113368e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.126874e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.126874e+03 ) sec^-1 MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 -TOTAL : 2.166211 sec - 6,421,975,036 cycles # 2.959 GHz - 20,086,123,706 instructions # 3.13 insn per cycle - 2.181924706 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 2.313934 sec + 6,500,918,155 cycles # 2.804 GHz + 20,111,364,543 instructions # 3.09 insn per cycle + 2.332783497 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:13231) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274861465384638E-004 -Relative difference = 2.211071647257023e-08 +Avg ME (C++/C++) = 6.627485e-04 +Avg ME (F77/C++) = 6.6274853360924479E-004 +Relative difference = 5.071191384964548e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.634914e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.641813e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.641813e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.589760e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.596530e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.596530e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 1.012157 sec - 2,920,524,129 cycles # 2.875 GHz - 7,130,900,907 instructions # 2.44 insn per cycle - 1.024052491 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) +TOTAL : 1.040223 sec + 2,815,442,217 cycles # 2.695 GHz + 7,038,519,370 instructions # 2.50 insn per cycle + 1.057514134 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11587) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668077068E-004 -Relative difference = 5.008498817890231e-09 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.776439e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.783925e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.783925e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.751311e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.759469e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.759469e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 -TOTAL : 0.931369 sec - 2,592,272,120 cycles # 2.772 GHz - 6,439,655,252 instructions # 2.48 insn per cycle - 0.940797030 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) +TOTAL : 0.945030 sec + 2,478,506,957 cycles # 2.610 GHz + 6,280,796,881 instructions # 2.53 insn per cycle + 0.988336476 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10302) (512y: 50) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627194e-04 -Avg ME (F77/C++) = 6.6271939668077068E-004 -Relative difference = 5.008498817890231e-09 +Avg ME (C++/C++) = 6.627193e-04 +Avg ME (F77/C++) = 6.6271927529261421E-004 +Relative difference = 3.728182620967159e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.511530e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.517127e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.517127e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.386273e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.391271e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.391271e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 -TOTAL : 1.092797 sec - 2,119,842,752 cycles # 1.933 GHz - 3,428,582,326 instructions # 1.62 insn per cycle - 1.103781056 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2911) (512y: 22) (512z: 9647) +TOTAL : 1.191730 sec + 2,039,311,665 cycles # 1.704 GHz + 3,248,072,614 instructions # 1.59 insn per cycle + 1.208300824 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2140) (512y: 48) (512z: 9219) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 -Avg ME (F77/C++) = 6.6271952032322112E-004 -Relative difference = 3.066639970473621e-08 +Avg ME (F77/C++) = 6.6271952818273971E-004 +Relative difference = 4.252589469696448e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 7e38ef2c7b..77eae3ae9c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:04:36 +DATE: 2024-01-30_05:40:00 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.586198e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.628840e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.633485e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.547321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.587547e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.593359e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.485828 sec - 2,082,184,633 cycles # 2.938 GHz - 3,131,974,897 instructions # 1.50 insn per cycle - 0.768933691 seconds time elapsed +TOTAL : 0.493771 sec + 2,067,778,848 cycles # 2.808 GHz + 3,079,367,454 instructions # 1.49 insn per cycle + 0.793803934 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.739364e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.800371e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.802973e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.730139e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.790957e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.793762e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.856072 sec - 6,264,858,693 cycles # 2.996 GHz - 12,861,475,785 instructions # 2.05 insn per cycle - 2.150989164 seconds time elapsed +TOTAL : 1.861027 sec + 6,035,258,548 cycles # 2.873 GHz + 13,088,532,370 instructions # 2.17 insn per cycle + 2.157681364 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 @@ -86,47 +86,47 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.664072e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.664878e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.664878e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.963182 sec - 87,150,309,684 cycles # 3.009 GHz - 135,629,504,187 instructions # 1.56 insn per cycle - 28.967350767 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15563) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.418156e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.418889e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.418889e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 30.278912 sec + 87,193,893,967 cycles # 2.880 GHz + 133,999,567,781 instructions # 1.54 insn per cycle + 30.284052553 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16123) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275340277317796E-004 -Relative difference = 4.184328521943034e-09 +Avg ME (C++/C++) = 6.627535e-04 +Avg ME (F77/C++) = 6.6275354356437610E-004 +Relative difference = 6.573239683366044e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.045716e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.058481e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.058481e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.336070 sec - 6,777,659,542 cycles # 2.897 GHz - 19,385,758,717 instructions # 2.86 insn per cycle - 2.340385269 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.858617e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.871131e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.871131e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 +TOTAL : 2.400232 sec + 6,719,203,240 cycles # 2.795 GHz + 19,163,412,782 instructions # 2.85 insn per cycle + 2.405407499 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -134,80 +134,80 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862707273868E-004 -Relative difference = 4.0849182767952624e-08 +Avg ME (F77/C++) = 6.6274859783433532E-004 +Relative difference = 3.2677016209485094e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.472442e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.477968e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.477968e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.418642e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.423893e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.423893e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.122029 sec - 3,176,939,727 cycles # 2.822 GHz - 6,808,012,735 instructions # 2.14 insn per cycle - 1.126367927 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) +TOTAL : 1.168526 sec + 3,140,858,608 cycles # 2.683 GHz + 6,747,205,943 instructions # 2.15 insn per cycle + 1.173847287 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:48625) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731558747466E-004 -Relative difference = 2.3520194007978538e-08 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724143469353E-004 +Relative difference = 6.252149235286529e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.771353e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.779044e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.779044e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.703185e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.710717e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.710717e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.933958 sec - 2,648,978,339 cycles # 2.826 GHz - 5,986,332,919 instructions # 2.26 insn per cycle - 0.938142599 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) +TOTAL : 0.972109 sec + 2,610,520,883 cycles # 2.675 GHz + 5,931,408,487 instructions # 2.27 insn per cycle + 0.977161465 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:42219) (512y: 24) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731558747466E-004 -Relative difference = 2.3520194007978538e-08 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724143469353E-004 +Relative difference = 6.252149235286529e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.477195e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.482900e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.482900e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.118598 sec - 2,077,867,589 cycles # 1.852 GHz - 3,500,933,885 instructions # 1.68 insn per cycle - 1.122888926 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5197) (512y: 3) (512z:44822) +EvtsPerSec[Rmb+ME] (23) = ( 1.380375e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.385342e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.385342e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.197476 sec + 2,050,152,648 cycles # 1.706 GHz + 3,435,996,672 instructions # 1.68 insn per cycle + 1.202741015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4188) (512y: 9) (512z:44489) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750363879224E-004 -Relative difference = 5.490631193034436e-09 +Avg ME (F77/C++) = 6.6272748295826550E-004 +Relative difference = 2.5714542480216212e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index be288b051a..0e738d355a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_15:05:29 +DATE: 2024-01-30_05:40:55 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.522303e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.562037e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.566663e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.495403e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.535166e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.540654e+05 ) sec^-1 MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 -TOTAL : 0.487689 sec - 2,083,511,350 cycles # 2.934 GHz - 3,089,371,073 instructions # 1.48 insn per cycle - 0.770457246 seconds time elapsed +TOTAL : 0.492553 sec + 2,044,553,803 cycles # 2.834 GHz + 3,023,997,415 instructions # 1.48 insn per cycle + 0.781011529 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.692823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.753155e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.755861e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.639095e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.697524e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.700186e+05 ) sec^-1 MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 -TOTAL : 1.857687 sec - 6,225,343,404 cycles # 2.971 GHz - 13,475,402,986 instructions # 2.16 insn per cycle - 2.151878134 seconds time elapsed +TOTAL : 1.866900 sec + 6,069,227,607 cycles # 2.871 GHz + 11,631,061,560 instructions # 1.92 insn per cycle + 2.173672620 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 @@ -86,47 +86,47 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.702624e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.703436e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.703436e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 -TOTAL : 28.768124 sec - 86,390,205,800 cycles # 3.004 GHz - 135,908,144,933 instructions # 1.57 insn per cycle - 28.772428929 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 5.528053e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.528817e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.528817e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059968e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 29.676151 sec + 85,692,453,161 cycles # 2.888 GHz + 134,120,579,675 instructions # 1.57 insn per cycle + 29.681167734 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:16109) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275352674967369E-004 -Relative difference = 4.0361421941458736e-08 +Avg ME (C++/C++) = 6.627536e-04 +Avg ME (F77/C++) = 6.6275357377482830E-004 +Relative difference = 3.95700176737784e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.022155e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.035059e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.035059e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 -TOTAL : 2.343932 sec - 6,861,828,247 cycles # 2.923 GHz - 19,438,837,198 instructions # 2.83 insn per cycle - 2.348614161 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.924333e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.936823e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.936823e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059961e+00 +- 2.367791e+00 ) GeV^-4 +TOTAL : 2.377362 sec + 6,721,293,685 cycles # 2.823 GHz + 19,223,635,236 instructions # 2.86 insn per cycle + 2.382317911 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:68882) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -134,80 +134,80 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274862764021530E-004 -Relative difference = 4.170542995014107e-08 +Avg ME (F77/C++) = 6.6274859765498573E-004 +Relative difference = 3.538316437387639e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.496126e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.501849e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.501849e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.449646e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.455242e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.455242e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 1.104423 sec - 3,132,664,979 cycles # 2.827 GHz - 6,718,780,084 instructions # 2.14 insn per cycle - 1.108980849 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) +TOTAL : 1.140025 sec + 3,079,658,771 cycles # 2.692 GHz + 6,686,222,708 instructions # 2.17 insn per cycle + 1.145080651 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:47416) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731651051409E-004 -Relative difference = 2.4912983202981302e-08 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724133897148E-004 +Relative difference = 6.237705578619894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.776930e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.784946e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.784946e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.717993e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.725785e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.725785e+04 ) sec^-1 MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 -TOTAL : 0.932427 sec - 2,628,346,991 cycles # 2.809 GHz - 5,969,517,035 instructions # 2.27 insn per cycle - 0.937063186 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) +TOTAL : 0.963197 sec + 2,607,305,399 cycles # 2.696 GHz + 5,935,632,787 instructions # 2.28 insn per cycle + 0.968307475 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:41564) (512y: 18) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627273e-04 -Avg ME (F77/C++) = 6.6272731651051409E-004 -Relative difference = 2.4912983202981302e-08 +Avg ME (C++/C++) = 6.627272e-04 +Avg ME (F77/C++) = 6.6272724133897148E-004 +Relative difference = 6.237705578619894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=1] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.476207e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.481770e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.481770e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 -TOTAL : 1.119825 sec - 2,078,177,152 cycles # 1.850 GHz - 3,494,307,916 instructions # 1.68 insn per cycle - 1.124539671 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4161) (512y: 4) (512z:44465) +EvtsPerSec[Rmb+ME] (23) = ( 1.382587e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.387561e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.387561e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060905e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.195178 sec + 2,050,651,524 cycles # 1.710 GHz + 3,422,960,187 instructions # 1.67 insn per cycle + 1.200266882 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3375) (512y: 11) (512z:43966) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 -Avg ME (F77/C++) = 6.6272750384530066E-004 -Relative difference = 5.80223501432476e-09 +Avg ME (F77/C++) = 6.6272749650985591E-004 +Relative difference = 5.26633351741962e-09 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index 5f7b3a9875..7714401e20 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:38:36 +DATE: 2024-01-30_05:03:07 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.457884e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.485326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.487559e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.456900e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.484722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.487399e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520964 sec - 2,264,629,245 cycles # 3.014 GHz - 3,506,769,468 instructions # 1.55 insn per cycle - 0.829035526 seconds time elapsed +TOTAL : 0.528831 sec + 2,192,645,567 cycles # 2.833 GHz + 3,378,106,633 instructions # 1.54 insn per cycle + 0.861052908 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.121692e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.155469e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.156924e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.113905e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.147620e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.149017e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.040169 sec - 10,113,736,424 cycles # 3.067 GHz - 21,679,010,671 instructions # 2.14 insn per cycle - 3.353783770 seconds time elapsed +TOTAL : 3.049267 sec + 9,507,642,735 cycles # 2.871 GHz + 19,066,132,971 instructions # 2.01 insn per cycle + 3.371164553 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.970949e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.971967e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.971967e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.769606e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.770411e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.770411e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.331676 sec - 25,844,448,081 cycles # 3.101 GHz - 79,435,783,122 instructions # 3.07 insn per cycle - 8.337986155 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.278224 sec + 26,812,823,901 cycles # 2.889 GHz + 82,462,709,559 instructions # 3.08 insn per cycle + 9.289930135 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6623) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.738000e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.741428e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.741428e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.509625e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.512894e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.512894e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.396976 sec - 12,662,050,046 cycles # 2.877 GHz - 38,549,909,567 instructions # 3.04 insn per cycle - 4.410716297 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.686363 sec + 12,638,766,565 cycles # 2.696 GHz + 38,538,047,706 instructions # 3.05 insn per cycle + 4.708715306 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12755) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.572381e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.589867e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.589867e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.005037e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.021640e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.021640e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.922338 sec - 5,514,460,304 cycles # 2.862 GHz - 13,481,024,869 instructions # 2.44 insn per cycle - 1.938160725 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) +TOTAL : 2.058850 sec + 5,538,789,085 cycles # 2.684 GHz + 13,583,257,196 instructions # 2.45 insn per cycle + 2.079297542 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10944) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.241393e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.262450e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.262450e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.175649e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.196938e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.196938e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.784180 sec - 4,876,372,587 cycles # 2.727 GHz - 12,135,890,910 instructions # 2.49 insn per cycle - 1.800822098 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) +TOTAL : 1.797590 sec + 4,843,535,516 cycles # 2.687 GHz + 12,110,039,110 instructions # 2.50 insn per cycle + 1.813279758 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9682) (512y: 76) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.118322e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.130873e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.130873e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.862805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.874864e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.874864e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.314160 sec - 4,150,527,960 cycles # 1.791 GHz - 6,337,492,716 instructions # 1.53 insn per cycle - 2.327321030 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) +TOTAL : 2.399875 sec + 4,096,013,404 cycles # 1.704 GHz + 6,283,624,620 instructions # 1.53 insn per cycle + 2.418716991 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1528) (512y: 76) (512z: 9010) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index d8b5e539f7..9cdb5ea5b9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2023-11-24_14:39:13 +DATE: 2024-01-30_05:03:46 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.493159e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.521560e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.523711e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.463401e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.491582e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.494105e+05 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 0.520820 sec - 2,242,158,202 cycles # 2.985 GHz - 3,540,678,032 instructions # 1.58 insn per cycle - 0.821178917 seconds time elapsed +TOTAL : 0.528345 sec + 2,191,366,155 cycles # 2.835 GHz + 3,376,981,873 instructions # 1.54 insn per cycle + 0.868249282 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.120115e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.154154e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155531e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.141311e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.175321e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176779e+05 ) sec^-1 MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 -TOTAL : 3.032729 sec - 10,102,528,341 cycles # 3.074 GHz - 22,682,645,762 instructions # 2.25 insn per cycle - 3.344844132 seconds time elapsed +TOTAL : 3.034775 sec + 9,461,569,572 cycles # 2.871 GHz + 21,570,730,622 instructions # 2.28 insn per cycle + 3.354365055 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.963020e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.963993e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.963993e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.763986e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.764820e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.764820e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 8.365135 sec - 25,986,900,256 cycles # 3.106 GHz - 79,446,545,297 instructions # 3.06 insn per cycle - 8.371739931 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 9.308542 sec + 26,818,191,963 cycles # 2.880 GHz + 82,362,969,124 instructions # 3.07 insn per cycle + 9.331807277 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 6491) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.587282e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.590429e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.590429e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.494755e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.497969e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.497969e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 4.581482 sec - 12,652,204,532 cycles # 2.760 GHz - 38,520,894,837 instructions # 3.04 insn per cycle - 4.597176677 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.702744 sec + 12,651,856,685 cycles # 2.688 GHz + 38,557,643,348 instructions # 3.05 insn per cycle + 4.723006762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:12729) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.252221e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.267695e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.267695e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.057026e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.073448e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.073448e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.997543 sec - 5,563,118,159 cycles # 2.781 GHz - 13,606,463,107 instructions # 2.45 insn per cycle - 2.009289653 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) +TOTAL : 2.045356 sec + 5,503,322,263 cycles # 2.685 GHz + 13,599,131,001 instructions # 2.47 insn per cycle + 2.065937163 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10926) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.355631e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.376642e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.376642e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.173965e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.195231e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.195231e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 1.762069 sec - 4,918,440,362 cycles # 2.785 GHz - 12,270,891,194 instructions # 2.49 insn per cycle - 1.776725149 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) +TOTAL : 1.797623 sec + 4,836,406,491 cycles # 2.684 GHz + 12,123,840,407 instructions # 2.51 insn per cycle + 1.816744592 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 9659) (512y: 76) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.144291e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.157521e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.157521e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.872297e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.884618e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.884618e+03 ) sec^-1 MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 -TOTAL : 2.305102 sec - 4,151,951,816 cycles # 1.798 GHz - 6,443,598,853 instructions # 1.55 insn per cycle - 2.315841771 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) +TOTAL : 2.396265 sec + 4,088,419,794 cycles # 1.703 GHz + 6,289,480,909 instructions # 1.54 insn per cycle + 2.414194012 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1508) (512y: 76) (512z: 9009) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProces cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 -Avg ME (F77/C++) = 6.6266730409276857E-004 -Relative difference = 2.956342832710188e-07 +Avg ME (F77/C++) = 6.6266730409276836E-004 +Relative difference = 2.9563428359824236e-07 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 8be70208e9..10dc25694a 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:41:33 +DATE: 2024-01-30_05:06:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.061812e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.062202e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.062394e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.064289e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.064686e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.064874e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.449920 sec - 8,136,633,317 cycles # 2.967 GHz - 17,299,512,986 instructions # 2.13 insn per cycle - 2.841850939 seconds time elapsed +TOTAL : 2.459219 sec + 7,914,579,350 cycles # 2.876 GHz + 17,414,362,649 instructions # 2.20 insn per cycle + 2.856648920 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.240506e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.242687e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.242958e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.261836e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.264181e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.264456e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.988007 sec - 13,311,291,301 cycles # 3.086 GHz - 31,141,162,586 instructions # 2.34 insn per cycle - 4.368862794 seconds time elapsed +TOTAL : 4.001930 sec + 12,466,660,301 cycles # 2.881 GHz + 28,598,806,424 instructions # 2.29 insn per cycle + 4.385332309 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.074588e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.074809e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.074809e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.667093e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.667308e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.667308e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.548778 sec - 18,889,945,267 cycles # 2.886 GHz - 53,916,181,025 instructions # 2.85 insn per cycle - 6.559444585 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.887821 sec + 18,997,365,246 cycles # 2.759 GHz + 55,182,817,229 instructions # 2.90 insn per cycle + 6.894930966 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.664916e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.665007e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.665007e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.565125e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.565211e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.565211e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.179365 sec - 9,817,296,962 cycles # 3.087 GHz - 27,093,187,078 instructions # 2.76 insn per cycle - 3.190791457 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.384131 sec + 9,789,568,447 cycles # 2.893 GHz + 27,057,217,068 instructions # 2.76 insn per cycle + 3.398188002 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.619576e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.620013e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.620013e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.331784e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332213e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332213e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.467898 sec - 4,254,332,720 cycles # 2.899 GHz - 9,562,072,100 instructions # 2.25 insn per cycle - 1.478868560 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +TOTAL : 1.593056 sec + 4,251,132,724 cycles # 2.667 GHz + 9,566,982,441 instructions # 2.25 insn per cycle + 1.603318722 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.155197e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.155766e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.155766e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.782288e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.782847e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.782847e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.277897 sec - 3,721,805,443 cycles # 2.907 GHz - 8,486,279,618 instructions # 2.28 insn per cycle - 1.292716754 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) +TOTAL : 1.405349 sec + 3,719,980,949 cycles # 2.646 GHz + 8,451,730,597 instructions # 2.27 insn per cycle + 1.418908281 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.734112e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.734785e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.734785e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.332107e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.332611e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.332611e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.420502 sec - 2,698,172,498 cycles # 1.898 GHz - 4,274,439,269 instructions # 1.58 insn per cycle - 1.431694585 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) +TOTAL : 1.593467 sec + 2,690,971,905 cycles # 1.687 GHz + 4,249,909,932 instructions # 1.58 insn per cycle + 1.609272621 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index cc408356f3..14598d99fd 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_15:14:25 +DATE: 2024-01-30_05:50:15 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059911e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060821e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060821e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.062580e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.063573e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.063573e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.369780 sec - 8,074,775,034 cycles # 3.005 GHz - 17,556,507,233 instructions # 2.17 insn per cycle - 2.745666195 seconds time elapsed +TOTAL : 2.393250 sec + 7,805,787,223 cycles # 2.878 GHz + 17,759,546,689 instructions # 2.28 insn per cycle + 2.771767839 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.206968e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.239855e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.239855e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.205412e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.241153e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.241153e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.981224 sec - 12,936,091,202 cycles # 3.008 GHz - 29,929,520,175 instructions # 2.31 insn per cycle - 4.359803213 seconds time elapsed +TOTAL : 3.999133 sec + 12,487,046,648 cycles # 2.887 GHz + 29,181,392,973 instructions # 2.34 insn per cycle + 4.379902707 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.880447e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.880664e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.880664e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.924049e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.924280e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.924280e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.704716 sec - 18,908,384,031 cycles # 2.819 GHz - 53,916,851,861 instructions # 2.85 insn per cycle - 6.708873310 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.668799 sec + 18,978,883,548 cycles # 2.845 GHz + 55,181,310,686 instructions # 2.91 insn per cycle + 6.673958990 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44874) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -127,20 +127,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.623567e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.623659e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.623659e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.558442e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558530e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558530e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.257235 sec - 9,753,060,796 cycles # 2.991 GHz - 27,093,339,492 instructions # 2.78 insn per cycle - 3.261809851 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.394020 sec + 9,815,752,501 cycles # 2.889 GHz + 27,056,612,659 instructions # 2.76 insn per cycle + 3.399148950 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97234) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -155,20 +155,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.503641e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.504075e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.504075e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.345002e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345461e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345461e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.512954 sec - 4,263,837,997 cycles # 2.812 GHz - 9,562,348,938 instructions # 2.24 insn per cycle - 1.517455505 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) +TOTAL : 1.587609 sec + 4,248,692,453 cycles # 2.674 GHz + 9,567,437,136 instructions # 2.25 insn per cycle + 1.592590793 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84279) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -183,20 +183,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.030228e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.030821e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.030821e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.873515e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.874138e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.874138e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.317705 sec - 3,745,717,606 cycles # 2.836 GHz - 8,485,950,351 instructions # 2.27 insn per cycle - 1.322163213 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) +TOTAL : 1.369431 sec + 3,692,449,005 cycles # 2.689 GHz + 8,450,968,058 instructions # 2.29 insn per cycle + 1.374284426 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79441) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -211,20 +211,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.640278e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.640943e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.640943e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.369341e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.369854e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.369854e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.457064 sec - 2,701,126,575 cycles # 1.849 GHz - 4,273,675,572 instructions # 1.58 insn per cycle - 1.461485660 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) +TOTAL : 1.574041 sec + 2,686,211,452 cycles # 1.702 GHz + 4,249,274,815 instructions # 1.58 insn per cycle + 1.579137128 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2166) (512y: 90) (512z:78318) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 5875d438d6..869fccfa2f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:42:37 +DATE: 2024-01-30_05:07:21 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.059999e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060392e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060490e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.062893e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.063296e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.063519e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 2.458037 sec - 8,400,321,652 cycles # 3.063 GHz - 17,432,347,823 instructions # 2.08 insn per cycle - 2.856914697 seconds time elapsed +TOTAL : 2.463696 sec + 7,904,124,830 cycles # 2.867 GHz + 17,962,469,169 instructions # 2.27 insn per cycle + 2.863348110 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 9.242262e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.244491e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.244679e+03 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.275434e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.277655e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.278066e+03 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 4.000355 sec - 13,346,085,971 cycles # 3.087 GHz - 30,375,563,442 instructions # 2.28 insn per cycle - 4.381330754 seconds time elapsed +TOTAL : 4.004406 sec + 12,472,043,203 cycles # 2.872 GHz + 27,476,431,943 instructions # 2.20 insn per cycle + 4.397866058 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.428708e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.428938e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.428938e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.993181e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.993429e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.993429e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.266392 sec - 18,760,509,477 cycles # 2.994 GHz - 53,924,653,614 instructions # 2.87 insn per cycle - 6.272821275 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.616864 sec + 18,937,214,023 cycles # 2.863 GHz + 55,162,675,285 instructions # 2.91 insn per cycle + 6.624084944 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44747) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.667610e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.667700e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.667700e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.560244e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.560337e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.560337e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 3.175383 sec - 9,846,531,791 cycles # 3.102 GHz - 27,090,667,624 instructions # 2.75 insn per cycle - 3.186565682 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.391033 sec + 9,810,909,577 cycles # 2.891 GHz + 27,064,931,751 instructions # 2.76 insn per cycle + 3.404410372 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97230) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.613188e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.613622e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.613622e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.366743e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367151e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367151e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.470967 sec - 4,269,806,021 cycles # 2.905 GHz - 9,562,364,627 instructions # 2.24 insn per cycle - 1.483852740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) +TOTAL : 1.577213 sec + 4,241,194,499 cycles # 2.687 GHz + 9,570,392,055 instructions # 2.26 insn per cycle + 1.590680511 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84249) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.136889e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.137474e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.137474e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.823083e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.823621e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823621e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.283162 sec - 3,736,243,344 cycles # 2.908 GHz - 8,485,942,736 instructions # 2.27 insn per cycle - 1.303492183 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) +TOTAL : 1.389663 sec + 3,742,544,913 cycles # 2.690 GHz + 8,455,558,047 instructions # 2.26 insn per cycle + 1.401942381 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79386) (512y: 90) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.757031e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.757649e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.757649e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.367545e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.368096e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.368096e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 1.412617 sec - 2,696,840,598 cycles # 1.907 GHz - 4,277,582,192 instructions # 1.59 insn per cycle - 1.424037126 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) +TOTAL : 1.578445 sec + 2,686,793,480 cycles # 1.702 GHz + 4,251,847,609 instructions # 1.58 insn per cycle + 1.591347897 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 90) (512z:78289) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index bdc444302c..a75bd83e48 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:43:40 +DATE: 2024-01-30_05:08:27 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.755368e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.756413e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.756816e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.769847e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.770754e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.771164e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.683293 sec - 5,751,994,986 cycles # 2.961 GHz - 11,825,851,354 instructions # 2.06 insn per cycle - 2.048483718 seconds time elapsed +TOTAL : 1.703094 sec + 5,571,181,653 cycles # 2.867 GHz + 11,974,166,174 instructions # 2.15 insn per cycle + 2.057946232 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.349504e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.350297e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.350390e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.318486e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.319261e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319430e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.907896 sec - 6,721,068,256 cycles # 3.077 GHz - 14,203,232,269 instructions # 2.11 insn per cycle - 2.241134413 seconds time elapsed +TOTAL : 1.904733 sec + 6,266,697,659 cycles # 2.868 GHz + 13,596,680,456 instructions # 2.17 insn per cycle + 2.241129899 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.138313e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.138596e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.138596e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.651013e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.651286e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.651286e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.786618 sec - 17,878,554,532 cycles # 3.089 GHz - 53,588,537,371 instructions # 3.00 insn per cycle - 5.793567200 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.115216 sec + 17,580,950,028 cycles # 2.876 GHz + 51,788,424,956 instructions # 2.95 insn per cycle + 6.122234952 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.595673e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.596126e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.596126e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.481350 sec - 4,569,631,683 cycles # 3.086 GHz - 13,762,990,034 instructions # 3.01 insn per cycle - 1.491903170 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.365857e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.366295e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.366295e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.576617 sec + 4,544,162,423 cycles # 2.878 GHz + 13,760,085,205 instructions # 3.03 insn per cycle + 1.587566374 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.220064e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.221950e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.221950e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.739476 sec - 2,140,174,105 cycles # 2.889 GHz - 4,817,678,891 instructions # 2.25 insn per cycle - 0.765666592 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.652038e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.653755e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.653755e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.803941 sec + 2,147,173,176 cycles # 2.667 GHz + 4,827,637,015 instructions # 2.25 insn per cycle + 0.818354401 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.208843e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.211063e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.211063e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.650167 sec - 1,891,036,782 cycles # 2.902 GHz - 4,274,768,942 instructions # 2.26 insn per cycle - 0.663284347 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.264093e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.266084e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.266084e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.735161 sec + 1,890,652,826 cycles # 2.565 GHz + 4,260,215,320 instructions # 2.25 insn per cycle + 0.752160652 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,35 +188,35 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.569007e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.571564e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.571564e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.710143 sec - 1,360,534,299 cycles # 1.926 GHz - 2,159,690,774 instructions # 1.59 insn per cycle - 0.724501882 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) +EvtsPerSec[Rmb+ME] (23) = ( 6.595587e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.597618e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.597618e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.809620 sec + 1,357,631,253 cycles # 1.673 GHz + 2,149,171,041 instructions # 1.58 insn per cycle + 0.843747051 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index b005f4754c..dd846fe890 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_15:15:29 +DATE: 2024-01-30_05:51:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) WARNING! 
Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.768306e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.770186e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.770186e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.783457e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.785575e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.785575e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 -TOTAL : 1.618405 sec - 5,667,284,995 cycles # 2.995 GHz - 11,929,859,554 instructions # 2.11 insn per cycle - 1.951873227 seconds time elapsed +TOTAL : 1.618103 sec + 5,426,392,715 cycles # 2.867 GHz + 11,041,442,286 instructions # 2.03 insn per cycle + 1.951599799 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.327456e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.341794e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.341794e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.306053e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.319762e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.319762e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 -TOTAL : 1.910757 sec - 6,253,041,183 cycles # 2.847 GHz - 13,520,927,317 instructions # 2.16 insn per cycle - 2.256378130 seconds time elapsed +TOTAL : 1.925915 sec + 6,319,647,912 cycles # 2.872 GHz + 13,785,417,374 instructions # 2.18 insn per cycle + 2.259607907 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.875191e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.875509e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.875509e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.635189e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.635467e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.635467e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.953864 sec - 17,981,915,231 cycles # 3.018 GHz - 53,589,180,868 instructions # 2.98 insn per cycle - 5.958154287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.119375 sec + 17,637,027,949 cycles # 2.881 GHz + 51,787,792,256 instructions # 2.94 insn per cycle + 6.124243714 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:27812) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,27 +120,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087541066E-003 -Relative difference = 2.1197698286506752e-08 +Avg ME (F77/C++) = 9.8479612087330436E-003 +Relative difference = 2.119555946686223e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.525468e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.525921e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.525921e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.503844 sec - 4,572,076,788 cycles # 3.033 GHz - 13,762,724,678 instructions # 3.01 insn per cycle - 1.508341488 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.362357e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.362789e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.362789e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.576085 sec + 4,544,551,937 cycles # 2.877 GHz + 13,759,350,934 instructions # 3.03 insn per cycle + 1.581388093 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97762) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,27 +148,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896527003E-003 -Relative difference = 3.151388282563952e-08 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.089959e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.091818e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.091818e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.750792 sec - 2,139,360,697 cycles # 2.835 GHz - 4,817,322,213 instructions # 2.25 insn per cycle - 0.755255088 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.701025e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.702845e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.702845e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.794089 sec + 2,138,661,629 cycles # 2.680 GHz + 4,826,930,405 instructions # 2.26 insn per cycle + 0.798991405 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84831) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,27 +176,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.083345e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.085639e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.085639e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.658894 sec - 1,877,804,985 cycles # 2.834 GHz - 4,274,294,161 instructions # 2.28 insn per cycle - 0.663396758 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.613418e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.615510e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.615510e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.699716 sec + 1,882,009,512 cycles # 2.675 GHz + 4,259,439,384 instructions # 2.26 insn per cycle + 0.704552121 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:80038) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -204,36 +204,36 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070551E-003 -Relative difference = 1.858823877057982e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.030887e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.033160e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.033160e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.757777 sec - 1,359,709,959 cycles # 1.785 GHz - 2,159,410,769 instructions # 1.59 insn per cycle - 0.762416842 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) +EvtsPerSec[Rmb+ME] (23) = ( 6.688489e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.690546e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.690546e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.795848 sec + 1,355,819,871 cycles # 1.696 GHz + 2,148,215,879 instructions # 1.58 insn per cycle + 0.800761416 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2820) (512y: 44) (512z:78510) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982958280E-003 -Relative difference = 2.0044092642523172e-08 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index a6a593e43c..90b9187b98 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:44:27 +DATE: 2024-01-30_05:09:16 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 6.747119e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.748060e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.748340e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.764318e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.765250e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.765666e+02 ) sec^-1 MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 -TOTAL : 1.659570 sec - 5,920,808,418 cycles # 3.063 GHz - 12,044,538,664 instructions # 2.03 insn per cycle - 1.991771483 seconds time elapsed +TOTAL : 1.705167 sec + 5,556,067,435 cycles # 2.852 GHz + 10,985,634,618 instructions # 1.98 insn per cycle + 2.060310498 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.325940e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.326739e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.326842e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.344230e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.345038e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.345205e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 -TOTAL : 1.907449 sec - 6,693,274,890 cycles # 3.061 GHz - 12,994,667,842 instructions # 1.94 insn per cycle - 2.242820084 seconds time elapsed +TOTAL : 1.928465 sec + 6,365,812,176 cycles # 2.870 GHz + 12,742,048,160 instructions # 2.00 insn per cycle + 2.275067290 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 9.023898e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.024201e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.024201e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 8.700294e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.700564e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.700564e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 -TOTAL : 5.861507 sec - 17,865,192,522 cycles # 3.046 GHz - 53,579,122,611 instructions # 3.00 insn per cycle - 5.865565703 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.077926 sec + 17,558,502,709 cycles # 2.889 GHz + 51,759,109,121 instructions # 2.95 insn per cycle + 6.085026833 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:27678) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087582491E-003 -Relative difference = 2.1198118933954545e-08 +Avg ME (F77/C++) = 9.8479612087313262E-003 +Relative difference = 2.1195385077844924e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.621823e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.622258e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.622258e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 -TOTAL : 1.463287 sec - 4,558,245,342 cycles # 3.108 GHz - 13,754,988,539 instructions # 3.02 insn per cycle - 1.467375257 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 3.376771e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.377174e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.377174e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825038e-06 ) GeV^-6 +TOTAL : 1.572289 sec + 4,548,603,521 cycles # 2.891 GHz + 13,758,604,883 instructions # 3.02 insn per cycle + 1.583710945 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:97728) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 -Avg ME (F77/C++) = 9.8479546896225560E-003 -Relative difference = 3.151694379513441e-08 +Avg ME (F77/C++) = 9.8479546894727158E-003 +Relative difference = 3.1532159158088894e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.105620e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.107481e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.107481e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.747732 sec - 2,126,324,616 cycles # 2.830 GHz - 4,818,424,886 instructions # 2.27 insn per cycle - 0.751960174 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 6.592179e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.593820e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.593820e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.808909 sec + 2,140,416,404 cycles # 2.637 GHz + 4,826,824,873 instructions # 2.26 insn per cycle + 0.906681144 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84793) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.303282e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.305517e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.305517e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 -TOTAL : 0.641081 sec - 1,865,331,641 cycles # 2.893 GHz - 4,275,221,232 instructions # 2.29 insn per cycle - 0.645264707 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 7.677326e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.679741e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.679741e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187187e-05 +- 9.826763e-06 ) GeV^-6 +TOTAL : 0.695609 sec + 1,868,752,206 cycles # 2.678 GHz + 4,259,067,854 instructions # 2.28 insn per cycle + 0.708960929 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79978) (512y: 46) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,35 +188,35 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProce cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 -Avg ME (F77/C++) = 9.8929728161070967E-003 -Relative difference = 1.8588234562202478e-08 +Avg ME (F77/C++) = 9.8929728159608508E-003 +Relative difference = 1.8603017364363385e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.530995e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.533488e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.533488e+02 ) sec^-1 -MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 -TOTAL : 0.706631 sec - 1,357,570,001 cycles # 1.912 GHz - 2,164,337,331 instructions # 1.59 insn per cycle - 0.710707193 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3474) (512y: 34) (512z:79492) +EvtsPerSec[Rmb+ME] (23) = ( 6.775075e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.777182e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.777182e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826770e-06 ) GeV^-6 +TOTAL : 0.788693 sec + 1,354,650,321 cycles # 1.715 GHz + 2,148,091,187 instructions # 1.59 insn per cycle + 0.801177717 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2776) (512y: 44) (512z:78501) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 9.892981e-03 -Avg ME (F77/C++) = 9.8929811982955140E-003 -Relative difference = 2.0044060904369713e-08 +Avg ME (C++/C++) = 9.892980e-03 +Avg ME (F77/C++) = 9.8929802670331551E-003 +Relative difference = 2.699218597469717e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 73f6f01a0a..4eda45e114 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:45:14 +DATE: 2024-01-30_05:10:05 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.691784e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.692281e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.692424e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.692959e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.693612e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.693848e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.167507 sec - 7,673,265,360 cycles # 3.083 GHz - 16,847,311,173 instructions # 2.20 insn per cycle - 2.544843357 seconds time elapsed +TOTAL : 2.179406 sec + 7,155,207,889 cycles # 2.861 GHz + 14,615,335,571 instructions # 2.04 insn per cycle + 2.559881855 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.109611e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.109929e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.109960e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111470e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111782e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111825e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.399118 sec - 11,512,402,509 cycles # 3.091 GHz - 25,362,588,706 instructions # 2.20 insn per cycle - 3.780006761 seconds time elapsed +TOTAL : 3.413893 sec + 10,746,707,284 cycles # 2.875 GHz + 23,674,149,917 instructions # 2.20 insn per cycle + 3.796927749 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 7.998480e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.998694e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.998694e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.884803e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.885022e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.885022e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.606791 sec - 19,173,459,975 cycles # 2.901 GHz - 54,151,769,179 instructions # 2.82 insn per cycle - 6.611030036 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.704107 sec + 19,257,123,030 cycles # 2.874 GHz + 55,394,447,460 instructions # 2.88 insn per cycle + 6.709385430 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44898) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.627384e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.627470e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.627470e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.509946e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.510039e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.510039e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.249018 sec - 9,449,828,715 cycles # 2.906 GHz - 26,158,243,948 instructions # 2.77 insn per cycle - 3.253176983 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.502177 sec + 9,384,694,038 cycles # 2.677 GHz + 25,874,743,625 instructions # 2.76 insn per cycle + 3.507349921 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96804) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.818750e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.819217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.819217e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.557555e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.558062e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.558062e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.387571 sec - 4,051,003,122 cycles # 2.912 GHz - 9,227,088,631 instructions # 2.28 insn per cycle - 1.391733244 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) +TOTAL : 1.490188 sec + 4,000,749,453 cycles # 2.678 GHz + 9,119,038,902 instructions # 2.28 insn per cycle + 1.495279789 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83820) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.365519e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.366141e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.366141e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.057405e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.058069e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.058069e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.214524 sec - 3,540,300,515 cycles # 2.907 GHz - 8,174,532,807 instructions # 2.31 insn per cycle - 1.218602300 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) +TOTAL : 1.307627 sec + 3,513,640,690 cycles # 2.679 GHz + 8,029,011,845 instructions # 2.29 insn per cycle + 1.312711431 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79028) (512y: 70) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.824767e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.825436e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.825436e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.350506e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.351010e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.351010e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.386311 sec - 2,658,660,002 cycles # 1.913 GHz - 4,154,162,009 instructions # 1.56 insn per cycle - 1.390371345 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) +TOTAL : 1.581908 sec + 2,606,864,065 cycles # 1.673 GHz + 4,077,382,976 instructions # 1.56 insn per cycle + 1.587144818 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1903) (512y: 70) (512z:78042) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index d4fe0b979f..328b61834e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2023-11-24_14:46:15 +DATE: 2024-01-30_05:11:08 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 4.682422e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.682989e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.683143e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.684370e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.684951e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.685153e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 2.171969 sec - 7,673,087,653 cycles # 3.078 GHz - 17,236,869,851 instructions # 2.25 insn per cycle - 2.550325638 seconds time elapsed +TOTAL : 2.181079 sec + 7,148,088,261 cycles # 2.853 GHz + 14,239,530,947 instructions # 1.99 insn per cycle + 2.562146879 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 1.106682e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.106997e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.107029e+04 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.111591e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.111914e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.111956e+04 ) sec^-1 MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 -TOTAL : 3.401616 sec - 11,548,737,077 cycles # 3.091 GHz - 25,085,297,801 instructions # 2.17 insn per cycle - 3.792911760 seconds time elapsed +TOTAL : 3.413150 sec + 10,755,861,454 cycles # 2.876 GHz + 23,518,245,564 instructions # 2.19 insn per cycle + 3.796500341 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 8.474149e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.474398e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.474398e+01 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.912565e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.912803e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.912803e+01 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 -TOTAL : 6.238723 sec - 19,082,506,717 cycles # 3.058 GHz - 54,152,929,714 instructions # 2.84 insn per cycle - 6.242654091 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 6.680088 sec + 19,228,329,737 cycles # 2.877 GHz + 55,419,296,273 instructions # 2.88 insn per cycle + 6.685533383 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:44806) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.640779e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.640877e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.640877e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.515454e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.515537e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.515537e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 3.222880 sec - 9,391,551,742 cycles # 2.911 GHz - 26,077,333,462 instructions # 2.78 insn per cycle - 3.227016085 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 3.490021 sec + 9,348,051,078 cycles # 2.676 GHz + 25,823,110,897 instructions # 2.76 insn per cycle + 3.495053121 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:96765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.773302e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.773766e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.773766e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.556805e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.557285e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.557285e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.403808 sec - 4,073,842,959 cycles # 2.895 GHz - 9,213,198,457 instructions # 2.26 insn per cycle - 1.407971002 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) +TOTAL : 1.490221 sec + 4,003,060,439 cycles # 2.680 GHz + 9,098,942,911 instructions # 2.27 insn per cycle + 1.495311791 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:83378) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.338283e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.338969e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.338969e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.083203e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.083821e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.083821e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.222714 sec - 3,541,596,074 cycles # 2.889 GHz - 8,167,599,536 instructions # 2.31 insn per cycle - 1.226930952 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) +TOTAL : 1.299137 sec + 3,488,850,980 cycles # 2.678 GHz + 8,010,474,997 instructions # 2.30 insn per cycle + 1.304443015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2:78540) (512y: 70) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.888294e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.888912e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.888912e+02 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.440905e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.441442e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.441442e+02 ) sec^-1 MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 -TOTAL : 1.363883 sec - 2,618,199,272 cycles # 1.915 GHz - 4,152,923,340 instructions # 1.59 insn per cycle - 1.368029549 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) +TOTAL : 1.541232 sec + 2,598,862,718 cycles # 1.682 GHz + 4,064,975,706 instructions # 1.56 insn per cycle + 1.546247038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1420) (512y: 70) (512z:78026) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 293e7e906a..5667ce458e 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:39:50 +DATE: 2024-01-30_05:04:25 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.679868e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.317378e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.685862e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.650880e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.304183e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.677107e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.442326 sec - 1,985,114,674 cycles # 3.012 GHz - 2,801,190,832 instructions # 1.41 insn per cycle - 0.731797225 seconds time elapsed +TOTAL : 0.453043 sec + 1,889,864,608 cycles # 2.824 GHz + 2,684,689,341 instructions # 1.42 insn per cycle + 0.749142975 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.245024e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.095249e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.522546e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.266493e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.111955e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.526543e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523645 sec - 2,298,323,986 cycles # 3.021 GHz - 3,281,368,879 instructions # 1.43 insn per cycle - 0.818048829 seconds time elapsed +TOTAL : 0.538455 sec + 2,216,644,376 cycles # 2.828 GHz + 3,102,394,165 instructions # 1.40 insn per cycle + 0.841378524 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.100295e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.122827e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.122827e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.822300e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003024e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003024e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.510825 sec - 4,703,874,541 cycles # 3.106 GHz - 13,462,408,396 instructions # 2.86 insn per cycle - 1.520640560 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.695633 sec + 4,892,910,077 cycles # 2.883 GHz + 13,801,787,359 instructions # 2.82 insn per cycle + 1.705964185 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.958580e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.032826e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.032826e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.896648e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.972375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.972375e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.858107 sec - 2,618,495,660 cycles # 3.035 GHz - 7,553,259,233 instructions # 2.88 insn per cycle - 0.870670449 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.887238 sec + 2,571,261,116 cycles # 2.883 GHz + 7,401,200,610 instructions # 2.88 insn per cycle + 0.906229412 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.413914e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.637783e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.637783e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.154928e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.367723e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.367723e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.501429 sec - 1,476,425,844 cycles # 2.921 GHz - 3,120,626,738 instructions # 2.11 insn per cycle - 0.516624124 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) +TOTAL : 0.543357 sec + 1,480,133,709 cycles # 2.701 GHz + 3,136,765,286 instructions # 2.12 insn per cycle + 0.561297241 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.779052e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.050087e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.050087e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.571891e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.844626e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.844626e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.454903 sec - 1,341,062,737 cycles # 2.922 GHz - 2,982,346,892 instructions # 2.22 insn per cycle - 0.471560702 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) +TOTAL : 0.482249 sec + 1,314,348,676 cycles # 2.699 GHz + 2,923,288,921 instructions # 2.22 insn per cycle + 0.498803372 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.570252e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.695598e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.695598e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.408041e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.532332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.532332e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.661079 sec - 1,328,110,043 cycles # 1.997 GHz - 1,954,412,178 instructions # 1.47 insn per cycle - 0.674908188 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) +TOTAL : 0.706292 sec + 1,273,944,985 cycles # 1.792 GHz + 1,900,262,296 instructions # 1.49 insn per cycle + 0.723222352 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index d85e5ad544..7b59743406 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_15:12:44 +DATE: 2024-01-30_05:48:29 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.546178e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.153075e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.153075e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.408359e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.101986e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.101986e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.474399 sec - 1,996,543,423 cycles # 2.914 GHz - 2,976,710,186 instructions # 1.49 insn per cycle - 0.743100653 seconds time elapsed +TOTAL : 0.481872 sec + 1,962,034,459 cycles # 2.824 GHz + 2,925,170,965 instructions # 1.49 insn per cycle + 0.753942373 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.225202e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.276966e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.276966e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.119182e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.257748e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.257748e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.757945 sec - 2,951,779,199 cycles # 2.926 GHz - 4,535,994,570 instructions # 1.54 insn per cycle - 1.067442306 seconds time elapsed +TOTAL : 0.770186 sec + 2,924,566,680 cycles # 2.837 GHz + 4,475,846,392 instructions # 1.53 insn per cycle + 1.089161093 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.063685e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.086099e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.086099e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.824024e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.002926e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002926e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.568160 sec - 4,739,058,277 cycles # 3.015 GHz - 13,467,544,042 instructions # 2.84 insn per cycle - 1.572567564 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.698586 sec + 4,927,814,709 cycles # 2.894 GHz + 13,806,118,322 instructions # 2.80 insn per cycle + 1.704123738 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1166) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,27 +120,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.915883e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.989942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.989942e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.886173e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963508e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963508e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.884700 sec - 2,669,440,783 cycles # 3.003 GHz - 7,602,375,116 instructions # 2.85 insn per cycle - 0.889666740 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.900867 sec + 2,618,017,951 cycles # 2.892 GHz + 7,450,102,141 instructions # 2.85 insn per cycle + 0.906367581 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2895) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,27 +148,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.224315e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.444185e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.444185e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.122916e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.345144e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.345144e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.539421 sec - 1,532,896,663 cycles # 2.819 GHz - 3,168,696,255 instructions # 2.07 insn per cycle - 0.544640536 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) +TOTAL : 0.557525 sec + 1,528,674,468 cycles # 2.721 GHz + 3,187,083,360 instructions # 2.08 insn per cycle + 0.563020024 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2890) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -183,20 +183,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.619993e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.881959e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.881959e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.528840e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.810605e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.810605e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.481010 sec - 1,371,011,914 cycles # 2.828 GHz - 3,030,736,025 instructions # 2.21 insn per cycle - 0.485465230 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) +TOTAL : 0.496872 sec + 1,359,999,193 cycles # 2.712 GHz + 2,973,904,476 instructions # 2.19 insn per cycle + 0.502643224 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2543) (512y: 93) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -211,20 +211,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.415824e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.530854e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.530854e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.332416e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.457397e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.457397e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.708748 sec - 1,355,688,416 cycles # 1.903 GHz - 1,991,539,116 instructions # 1.47 insn per cycle - 0.713346181 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) +TOTAL : 0.738182 sec + 1,327,509,915 cycles # 1.788 GHz + 1,939,124,841 instructions # 1.46 insn per cycle + 0.743808066 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1135) (512y: 62) (512z: 2165) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 46adce5ba7..4deacb88f2 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:40:07 +DATE: 2024-01-30_05:04:44 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.664367e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.214901e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.575772e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.642894e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.200887e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.567165e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.441294 sec - 1,975,560,580 cycles # 3.010 GHz - 2,812,494,086 instructions # 1.42 insn per cycle - 0.721240835 seconds time elapsed +TOTAL : 0.451244 sec + 1,883,873,657 cycles # 2.821 GHz + 2,671,262,226 instructions # 1.42 insn per cycle + 0.747348766 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.215656e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.999841e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.407602e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.228371e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.990649e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.395918e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.523997 sec - 2,293,718,680 cycles # 3.009 GHz - 3,268,283,620 instructions # 1.42 insn per cycle - 0.819195532 seconds time elapsed +TOTAL : 0.540844 sec + 2,218,030,903 cycles # 2.829 GHz + 3,154,136,532 instructions # 1.42 insn per cycle + 0.843504278 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.074061e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.096252e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.096252e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.831536e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.003712e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.003712e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.547162 sec - 4,714,230,759 cycles # 3.040 GHz - 13,457,176,123 instructions # 2.85 insn per cycle - 1.553898636 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.690067 sec + 4,884,610,591 cycles # 2.883 GHz + 13,807,943,276 instructions # 2.83 insn per cycle + 1.700194727 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1161) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499484 -Relative difference = 5.286896509487005e-07 +Avg ME (F77/C++) = 0.14247482467499481 +Relative difference = 5.286896511435107e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.993758e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.068724e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.068724e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.876876e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953061e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.842619 sec - 2,622,041,010 cycles # 3.096 GHz - 7,552,388,899 instructions # 2.88 insn per cycle - 0.855874832 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.896918 sec + 2,573,000,483 cycles # 2.854 GHz + 7,407,132,972 instructions # 2.88 insn per cycle + 0.971480588 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2892) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247482467499478 -Relative difference = 5.28689651338321e-07 +Avg ME (F77/C++) = 0.14247482467499475 +Relative difference = 5.286896515331313e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.386860e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.608787e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.608787e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.133331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.344053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.344053e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.505002 sec - 1,482,553,157 cycles # 2.917 GHz - 3,119,022,546 instructions # 2.10 insn per cycle - 0.516023532 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) +TOTAL : 0.546739 sec + 1,486,856,812 cycles # 2.696 GHz + 3,137,676,944 instructions # 2.11 insn per cycle + 0.563341736 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2875) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.786208e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.060798e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.060798e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.567673e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.839669e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.839669e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.453959 sec - 1,338,340,378 cycles # 2.921 GHz - 2,979,731,487 instructions # 2.23 insn per cycle - 0.465473907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) +TOTAL : 0.482732 sec + 1,314,507,412 cycles # 2.697 GHz + 2,925,746,939 instructions # 2.23 insn per cycle + 0.501062508 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2527) (512y: 93) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.592500e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.720014e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.720014e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.394430e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.516439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.516439e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.655376 sec - 1,326,766,373 cycles # 2.012 GHz - 1,952,310,196 instructions # 1.47 insn per cycle - 0.669844706 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) +TOTAL : 0.710071 sec + 1,273,890,672 cycles # 1.782 GHz + 1,899,944,131 instructions # 1.49 insn per cycle + 0.727352268 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1118) (512y: 62) (512z: 2165) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 271711c5ee..1362a87ac8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:40:25 +DATE: 2024-01-30_05:05:03 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.342241e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.209752e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.345410e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.327203e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.210086e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.349272e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.439415 sec - 1,959,210,302 cycles # 2.998 GHz - 2,778,383,612 instructions # 1.42 insn per cycle - 0.742350357 seconds time elapsed +TOTAL : 0.446159 sec + 1,908,363,704 cycles # 2.829 GHz + 2,678,040,252 instructions # 1.40 insn per cycle + 0.749417997 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.198954e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.802596e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.953593e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.267889e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.817352e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969269e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.476253 sec - 2,095,785,186 cycles # 2.994 GHz - 2,918,085,144 instructions # 1.39 insn per cycle - 0.759290235 seconds time elapsed +TOTAL : 0.483640 sec + 2,013,701,507 cycles # 2.833 GHz + 2,869,047,503 instructions # 1.42 insn per cycle + 0.770237631 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.156035e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.182076e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.182076e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.437672 sec - 4,454,268,487 cycles # 3.090 GHz - 13,048,356,445 instructions # 2.93 insn per cycle - 1.444257610 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.109983e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.136218e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.136218e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.498447 sec + 4,345,988,139 cycles # 2.893 GHz + 12,596,967,872 instructions # 2.90 insn per cycle + 1.511882134 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.036696e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.233511e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.233511e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.116392e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.330955e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.330955e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.559663 sec - 1,702,113,504 cycles # 3.019 GHz - 4,513,068,480 instructions # 2.65 insn per cycle - 0.574904754 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.547469 sec + 1,595,191,710 cycles # 2.889 GHz + 4,246,785,925 instructions # 2.66 insn per cycle + 0.566121323 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.119769e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.899660e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.899660e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.705122e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.431194e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.431194e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287512 sec - 851,994,706 cycles # 2.923 GHz - 1,897,184,726 instructions # 2.23 insn per cycle - 0.299631508 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) +TOTAL : 0.309154 sec + 853,106,357 cycles # 2.719 GHz + 1,916,236,758 instructions # 2.25 insn per cycle + 0.322202646 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.144841e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.980131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.980131e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.291153e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.186493e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.186493e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287863 sec - 803,737,999 cycles # 2.749 GHz - 1,820,191,781 instructions # 2.26 insn per cycle - 0.309536059 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) +TOTAL : 0.282114 sec + 781,605,305 cycles # 2.726 GHz + 1,797,850,243 instructions # 2.30 insn per cycle + 0.301017972 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.909283e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.426697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.426697e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.544342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.998908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.998908e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.356091 sec - 735,639,321 cycles # 2.041 GHz - 1,305,394,838 instructions # 1.77 insn per cycle - 0.370547154 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) +TOTAL : 0.384301 sec + 720,859,118 cycles # 1.854 GHz + 1,288,039,773 instructions # 1.79 insn per cycle + 0.402338897 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index 9afdfb410c..8cb59221d4 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -41,7 +41,7 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_15:13:02 +DATE: 2024-01-30_05:48:48 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= @@ -51,17 +51,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) WARNING! 
Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.454543e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.985559e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.985559e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.444132e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.000397e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.000397e+07 ) sec^-1 MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 -TOTAL : 0.454607 sec - 1,923,330,877 cycles # 2.893 GHz - 2,869,826,397 instructions # 1.49 insn per cycle - 0.721979990 seconds time elapsed +TOTAL : 0.460393 sec + 1,902,959,760 cycles # 2.835 GHz + 2,813,040,217 instructions # 1.48 insn per cycle + 0.731144371 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost @@ -77,17 +77,17 @@ WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.025147e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.561466e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.561466e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.962207e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.533260e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.533260e+07 ) sec^-1 MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 -TOTAL : 0.624986 sec - 2,513,731,167 cycles # 2.923 GHz - 3,800,964,888 instructions # 1.51 insn per cycle - 0.917368993 seconds time elapsed +TOTAL : 0.631701 sec + 2,471,933,418 cycles # 2.836 GHz + 3,725,494,141 instructions # 1.51 insn per cycle + 0.929474422 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 @@ -99,20 +99,20 @@ OK (relative difference <= 5E-3) runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.088101e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112561e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112561e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.529985 sec - 4,473,562,903 cycles # 2.917 GHz - 13,053,120,792 instructions # 2.92 insn per cycle - 1.534351278 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.095228e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.121318e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.121318e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.522701 sec + 4,367,827,655 cycles # 2.862 GHz + 12,601,331,452 instructions # 2.89 insn per cycle + 1.527862957 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 773) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -120,27 +120,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.988941e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.180615e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.180615e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.075499e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.292736e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.292736e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.573010 sec - 1,718,819,301 cycles # 2.980 GHz - 4,560,439,401 instructions # 2.65 insn per cycle - 0.577426659 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.559885 sec + 1,623,222,211 cycles # 2.878 GHz + 4,293,732,841 instructions # 2.65 insn per cycle + 0.565168184 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3265) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -148,27 +148,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 5.851561e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.587029e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.587029e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.618798e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.338072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.338072e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.304354 sec - 870,497,984 cycles # 2.826 GHz - 1,933,292,389 instructions # 2.22 insn per cycle - 0.308728297 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) +TOTAL : 0.317968 sec + 874,954,516 cycles # 2.715 GHz + 1,952,010,632 instructions # 2.23 insn per cycle + 0.323135602 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3488) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -176,27 +176,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.204206e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.044292e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.044292e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.140069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.015278e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.015278e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.288758 sec - 819,562,722 cycles # 2.802 GHz - 1,856,394,583 instructions # 2.27 insn per cycle - 0.293079680 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) +TOTAL : 0.296217 sec + 805,080,990 cycles # 2.697 GHz + 1,834,280,964 instructions # 2.28 insn per cycle + 0.301462842 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3186) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -204,27 +204,27 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! 
Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.631074e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.091279e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.091279e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.472935e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.920053e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.920053e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.380781 sec - 752,756,759 cycles # 1.958 GHz - 1,346,012,282 instructions # 1.79 insn per cycle - 0.385182072 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) +TOTAL : 0.395002 sec + 745,120,207 cycles # 1.866 GHz + 1,329,072,598 instructions # 1.78 insn per cycle + 0.400211929 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1730) (512y: 24) (512z: 2387) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -232,8 +232,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 8032039f3c..a71ead3e03 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:40:42 +DATE: 2024-01-30_05:05:20 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 5.261838e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.178248e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.310022e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.328749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.215965e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.352409e+08 ) sec^-1 MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 -TOTAL : 0.437409 sec - 1,963,259,812 cycles # 3.015 GHz - 2,775,580,273 instructions # 1.41 insn per cycle - 0.717773083 seconds time elapsed +TOTAL : 0.447516 sec + 1,904,038,107 cycles # 2.819 GHz + 2,679,740,960 instructions # 1.41 insn per cycle + 0.754557698 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 7.175724e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.774857e+08 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.916629e+08 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 7.182679e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.774687e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.914662e+08 ) sec^-1 MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 -TOTAL : 0.475371 sec - 2,078,846,085 cycles # 2.960 GHz - 2,876,831,626 instructions # 1.38 insn per cycle - 0.759711014 seconds time elapsed +TOTAL : 0.483893 sec + 2,007,596,287 cycles # 2.824 GHz + 2,863,986,921 instructions # 1.43 insn per cycle + 0.770182944 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.164325e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.190669e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.190669e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 -TOTAL : 1.426803 sec - 4,442,867,476 cycles # 3.106 GHz - 13,028,535,311 instructions # 2.93 insn per cycle - 1.433195231 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) +EvtsPerSec[Rmb+ME] (23) = ( 1.104449e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.131163e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.131163e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 1.505306 sec + 4,350,737,729 cycles # 2.883 GHz + 12,588,700,465 instructions # 2.89 insn per cycle + 1.517040580 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 759) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -107,26 +107,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246857540270419 -Relative difference = 1.7265064590569047e-07 +Avg ME (F77/C++) = 0.14246860569653919 +Relative difference = 3.998452420257791e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.121532e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.320092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.320092e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.107801e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.322563e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.322563e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 -TOTAL : 0.543798 sec - 1,695,653,127 cycles # 3.095 GHz - 4,509,334,487 instructions # 2.66 insn per cycle - 0.556325987 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.548678 sec + 1,589,053,041 cycles # 2.872 GHz + 4,241,478,972 instructions # 2.67 insn per cycle + 0.565533397 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3248) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -134,26 +134,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 -Avg ME (F77/C++) = 0.14246859631675157 -Relative difference = 2.5853054135974944e-08 +Avg ME (F77/C++) = 0.14246860808920836 +Relative difference = 5.677888572434963e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.122817e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.890273e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.890273e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 5.682195e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.406347e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.406347e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.287226 sec - 852,393,433 cycles # 2.926 GHz - 1,893,938,143 instructions # 2.22 insn per cycle - 0.301768010 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) +TOTAL : 0.310286 sec + 851,032,417 cycles # 2.702 GHz + 1,913,907,734 instructions # 2.25 insn per cycle + 0.327654627 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3463) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -161,26 +161,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 6.582926e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.478799e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.478799e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 6.251030e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.131063e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.131063e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.268312 sec - 800,191,089 cycles # 2.937 GHz - 1,816,120,065 instructions # 2.27 insn per cycle - 0.281438188 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) +TOTAL : 0.283621 sec + 779,432,148 cycles # 2.704 GHz + 1,795,928,128 instructions # 2.30 insn per cycle + 0.301196370 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3164) (512y: 15) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -188,26 +188,26 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489318272599 -Relative difference = 4.784894739577799e-08 +Avg ME (F77/C++) = 0.14247490815036912 +Relative difference = 5.7205649062398515e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 4.957580e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.456912e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.456912e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 4.530328e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.979352e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.979352e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 -TOTAL : 0.351938 sec - 734,061,618 cycles # 2.062 GHz - 1,302,951,487 instructions # 1.77 insn per cycle - 0.362652515 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 32) (512z: 2383) +TOTAL : 0.386557 sec + 722,333,254 cycles # 1.844 GHz + 1,287,373,146 instructions # 1.78 insn per cycle + 0.407217093 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1709) (512y: 24) (512z: 2387) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -215,8 +215,8 @@ runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcess cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 -Avg ME (F77/C++) = 0.14247489383243206 -Relative difference = 4.32888033512879e-08 +Avg ME (F77/C++) = 0.14247490450137867 +Relative difference = 3.159418737238044e-08 OK (relative difference <= 5E-3) ========================================================================= diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 1f5cde87e5..3f17b073e2 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:40:58 +DATE: 2024-01-30_05:05:38 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.593637e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.233261e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.638999e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.696364e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.334716e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.710197e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.449003 sec - 1,949,950,502 cycles # 2.943 GHz - 2,732,368,699 instructions # 1.40 insn per cycle - 0.743857131 seconds time elapsed +TOTAL : 0.455743 sec + 1,899,569,009 cycles # 2.822 GHz + 2,690,270,670 instructions # 1.42 insn per cycle + 0.752301124 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.250012e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.124195e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.543411e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.256330e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.134663e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.562668e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.532370 sec - 2,306,545,413 cycles # 3.002 GHz - 3,257,931,207 instructions # 1.41 insn per cycle - 0.827800812 seconds time elapsed +TOTAL : 0.544746 sec + 2,203,075,600 cycles # 2.810 GHz + 3,150,811,707 instructions # 1.43 insn per cycle + 0.843284004 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.090450e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112908e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112908e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.796791e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.000139e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.000139e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.524359 sec - 4,736,940,349 cycles # 3.101 GHz - 13,465,176,292 instructions # 2.84 insn per cycle - 1.531673516 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.696226 sec + 4,903,205,903 cycles # 2.884 GHz + 13,824,553,372 instructions # 2.82 insn per cycle + 1.707005330 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1135) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.007098e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.083721e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.083721e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.870381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.944831e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.944831e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.837584 sec - 2,604,823,117 cycles # 3.095 GHz - 7,385,606,203 instructions # 2.84 insn per cycle - 0.848813750 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.899365 sec + 2,603,553,029 cycles # 2.880 GHz + 7,349,607,266 instructions # 2.82 insn per cycle + 0.916195330 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2967) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.452149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.684416e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.684416e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.167537e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.382178e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.382178e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.496103 sec - 1,465,319,733 cycles # 2.929 GHz - 3,056,071,451 instructions # 2.09 insn per cycle - 0.510799326 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) +TOTAL : 0.541013 sec + 1,471,630,021 cycles # 2.697 GHz + 3,084,577,547 instructions # 2.10 insn per cycle + 0.558891839 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3008) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.858923e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.141930e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.141930e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.661938e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.948590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.948590e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.446088 sec - 1,306,981,940 cycles # 2.902 GHz - 2,931,113,574 instructions # 2.24 insn per cycle - 0.461243420 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) +TOTAL : 0.471097 sec + 1,285,426,170 cycles # 2.700 GHz + 2,873,286,331 instructions # 2.24 insn per cycle + 0.489244149 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2653) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.504343e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.619127e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.619127e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.322096e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.437722e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.437722e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.677792 sec - 1,365,651,157 cycles # 2.003 GHz - 1,970,379,541 instructions # 1.44 insn per cycle - 0.689595363 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) +TOTAL : 0.731479 sec + 1,311,962,532 cycles # 1.782 GHz + 1,915,335,630 instructions # 1.46 insn per cycle + 0.746286183 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 70) (512z: 2164) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index a8d85dd2f3..7294ddea09 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -41,23 +41,23 @@ CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2023-11-24_14:41:16 +DATE: 2024-01-30_05:05:56 On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 2.654184e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.189821e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.540640e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.635631e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.151573e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.502163e+07 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444065 sec - 2,018,867,268 cycles # 3.015 GHz - 2,835,456,943 instructions # 1.40 insn per cycle - 0.738016347 seconds time elapsed +TOTAL : 0.454560 sec + 1,887,319,720 cycles # 2.810 GHz + 2,686,521,155 instructions # 1.42 insn per cycle + 0.777570467 seconds time elapsed runNcu /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 @@ -65,17 +65,17 @@ WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions ......................................................................... runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 12.1.0)] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) -EvtsPerSec[Rmb+ME] (23) = ( 3.223829e+07 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.984831e+07 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.384004e+07 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.262333e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.007147e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.410963e+07 ) sec^-1 MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 -TOTAL : 0.536961 sec - 2,246,742,038 cycles # 2.906 GHz - 3,169,759,663 instructions # 1.41 insn per cycle - 0.832088924 seconds time elapsed +TOTAL : 0.538673 sec + 2,205,224,099 cycles # 2.822 GHz + 3,150,366,927 instructions # 1.43 insn per cycle + 0.838863876 seconds time elapsed ------------------------------------------------------------------------- cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 cmpExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 @@ -86,20 +86,20 @@ OK (relative difference <= 5E-3) ========================================================================= runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 1.089936e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.112348e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.112348e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 9.769998e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.971532e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.971532e+04 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 1.524608 sec - 4,732,046,885 cycles # 3.096 GHz - 13,451,186,768 instructions # 2.84 insn per cycle - 1.531974912 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 1.700454 sec + 4,910,062,395 cycles # 2.880 GHz + 13,831,764,171 instructions # 2.82 insn per cycle + 1.712052278 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1130) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -113,20 +113,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.003818e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.078884e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.078884e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 1.857842e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.932046e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.932046e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.838343 sec - 2,601,051,226 cycles # 3.087 GHz - 7,389,258,616 instructions # 2.84 insn per cycle - 0.853180948 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 0.905414 sec + 2,615,099,772 cycles # 2.873 GHz + 7,353,136,311 instructions # 2.81 insn per cycle + 0.925236073 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 2957) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -140,20 +140,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.440885e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.665503e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.665503e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.160999e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.374264e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.374264e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.497494 sec - 1,467,074,103 cycles # 2.924 GHz - 3,056,319,919 instructions # 2.08 insn per cycle - 0.512504484 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) +TOTAL : 0.541919 sec + 1,475,084,747 cycles # 2.698 GHz + 3,084,915,220 instructions # 2.09 insn per cycle + 0.559487031 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2986) (512y: 0) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -167,20 +167,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 3.870867e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.157659e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.157659e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 3.676411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.967587e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.967587e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.444644 sec - 1,306,078,783 cycles # 2.911 GHz - 2,931,883,300 instructions # 2.24 insn per cycle - 0.459998118 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) +TOTAL : 0.469154 sec + 1,285,211,957 cycles # 2.712 GHz + 2,875,140,516 instructions # 2.24 insn per cycle + 0.485058196 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2636) (512y: 96) (512z: 0) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. @@ -194,20 +194,20 @@ OK (relative difference <= 5E-3) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! 
CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 12.1.0] [inlineHel=0] [hardcodePARAM=1] Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] OMP threads / `nproc --all` = 1 / 4 -EvtsPerSec[Rmb+ME] (23) = ( 2.456308e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.570811e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.570811e+05 ) sec^-1 +EvtsPerSec[Rmb+ME] (23) = ( 2.334432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.451352e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.451352e+05 ) sec^-1 MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 -TOTAL : 0.690659 sec - 1,366,599,933 cycles # 1.966 GHz - 1,970,195,836 instructions # 1.44 insn per cycle - 0.702450567 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) +TOTAL : 0.728206 sec + 1,313,839,367 cycles # 1.794 GHz + 1,915,620,790 instructions # 1.46 insn per cycle + 0.743678029 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1476) (512y: 70) (512z: 2164) ------------------------------------------------------------------------- runExe /data/avalassi/GPU2023/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe [ PASSED ] 6 tests. diff --git a/epochX/cudacpp/tput/teeThroughputX.sh b/epochX/cudacpp/tput/teeThroughputX.sh index bd478452ac..de0a1e912a 100755 --- a/epochX/cudacpp/tput/teeThroughputX.sh +++ b/epochX/cudacpp/tput/teeThroughputX.sh @@ -93,6 +93,8 @@ for arg in $*; do rndgen=$arg elif [ "$arg" == "-curhst" ]; then rndgen=$arg + elif [ "$arg" == "-rorhst" ]; then + rndgen=$arg elif [ "$arg" == "-rmbhst" ]; then rmbsmp=$arg elif [ "$arg" == "-bridge" ]; then diff --git a/epochX/cudacpp/tput/throughputX.sh b/epochX/cudacpp/tput/throughputX.sh index 8160f7fbb9..503d060237 100755 --- a/epochX/cudacpp/tput/throughputX.sh +++ b/epochX/cudacpp/tput/throughputX.sh @@ -187,6 +187,9 @@ while [ "$1" != "" ]; do elif [ "$1" == "-curhst" ]; then rndgen=" -${1}" shift + elif [ "$1" == "-rorhst" ]; then + rndgen=" -${1}" + shift elif [ "$1" == "-rmbhst" ]; then rmbsmp=" -${1}" shift @@ -523,6 +526,7 @@ function cmpExe() { # Profile #registers and %divergence only function runNcu() { + if ! ncu -v > /dev/null 2>&1; then return; fi if [ "${maketype}" == "-dryrun" ]; then return; fi exe=$1 args="$2" @@ -545,6 +549,7 @@ function runNcu() { # See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/branchstatistics.htm # See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/sourcelevel/divergentbranch.htm function runNcuDiv() { + if ! ncu -v > /dev/null 2>&1; then return; fi if [ "${maketype}" == "-dryrun" ]; then return; fi exe=$1 args="-p 1 32 1" @@ -567,6 +572,7 @@ function runNcuDiv() { # Profiles sectors and requests function runNcuReq() { + if ! 
ncu -v > /dev/null 2>&1; then return; fi if [ "${maketype}" == "-dryrun" ]; then return; fi exe=$1 ncuArgs="$2" @@ -580,7 +586,13 @@ function runNcuReq() { set +x } -if nvidia-smi -L > /dev/null 2>&1; then gpuTxt="$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)"; else gpuTxt=none; fi +if nvidia-smi -L > /dev/null 2>&1; then + gpuTxt="$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)" +elif rocm-smi -i > /dev/null 2>&1; then + gpuTxt="$(rocm-smi --showproductname | grep 'Card series' | awk '{print $5,$6,$7}')" +else + gpuTxt=none +fi if [ "${unames}" == "Darwin" ]; then cpuTxt=$(sysctl -h machdep.cpu.brand_string) cpuTxt=${cpuTxt/machdep.cpu.brand_string: }
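Note on the throughputX.sh hunks above: the new "if ! ncu -v > /dev/null 2>&1; then return; fi" guards let the runNcu/runNcuDiv/runNcuReq profiling steps degrade gracefully on hosts without Nsight Compute, and the reworked gpuTxt block now falls back from nvidia-smi to rocm-smi before reporting "none". The standalone bash sketch below only illustrates that same probe-and-fallback pattern outside the script; the helper names detect_gpu_label and profile_if_possible are illustrative assumptions and do not exist in throughputX.sh.

#!/bin/bash
# Standalone sketch (NOT part of throughputX.sh): reproduce the probe-and-fallback
# pattern introduced in the hunks above. Helper names are illustrative only.

# Return a one-line GPU description: NVIDIA via nvidia-smi, else AMD via rocm-smi, else "none".
detect_gpu_label() {
  if nvidia-smi -L > /dev/null 2>&1; then
    echo "$(nvidia-smi -L | wc -l)x $(nvidia-smi -L | awk '{print $3,$4}' | sort -u)"
  elif rocm-smi -i > /dev/null 2>&1; then
    rocm-smi --showproductname | grep 'Card series' | awk '{print $5,$6,$7}'
  else
    echo none
  fi
}

# Run Nsight Compute only if it is installed and runnable, as the new runNcu* guards do.
profile_if_possible() {
  if ! ncu -v > /dev/null 2>&1; then echo "ncu not available, skipping profile"; return; fi
  ncu "$@"   # e.g. profile_if_possible ./gcheck.exe -p 64 256 1
}

gpuTxt=$(detect_gpu_label)
echo "GPU detected: ${gpuTxt}"

A "command -v ncu"-style PATH check would also work, but the hunks deliberately reuse each tool's own exit status (nvidia-smi -L, rocm-smi -i, ncu -v), which also catches binaries that are present but not runnable, so the sketch follows the same choice.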