diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..182ccd4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,81 @@ +{ + "files.associations": { + "*.icc": "cpp", + "limits": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "cinttypes": "cpp", + "typeinfo": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "condition_variable": "cpp", + "list": "cpp", + "ratio": "cpp", + "future": "cpp", + "mutex": "cpp", + "semaphore": "cpp", + "shared_mutex": "cpp", + "span": "cpp", + "stop_token": "cpp", + "thread": "cpp", + "cfenv": "cpp", + "variant": "cpp", + "format": "cpp", + "any": "cpp", + "source_location": "cpp", + "run_inference_particle_net.C": "cpp", + "test.C": "cpp" + } +} diff --git a/README.md b/README.md index 97902f8..597cb56 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,11 @@ source setup.sh ``` Now ROOT should also access the SOFIE libraries while it runs. 
This helps to accelerate development. Submit your developments here and we will proceed with the developments in ROOT carefull. - +3. To enable testing generated code with alpaka implementations, build using the following command: +```bash +cmake -Dtesting=ON -DENABLE_ALPAKA_TESTS=ON -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +``` +The default architecture is CUDA, but can be configured using an additional `-DALPAKA_BACKEND=hip` cmake option. ## Inspiration The standalone version of SOFIE is developed with inspiration from the standalone version of RooFit developed by Jonas Rembser that can be found [here](https://github.com/guitargeek/roofit). diff --git a/src/.vscode/settings.json b/src/.vscode/settings.json new file mode 100644 index 0000000..8bc121a --- /dev/null +++ b/src/.vscode/settings.json @@ -0,0 +1,61 @@ +{ + "files.associations": { + "*.icc": "cpp", + "iostream": "cpp", + "ostream": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "bitset": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "istream": "cpp", + "limits": "cpp", + "new": "cpp", + "numbers": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", 
+ "cinttypes": "cpp", + "typeinfo": "cpp" + } +} \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c48e8d1..102ca3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,3 +8,4 @@ set(sofie_legacy_eval_backend ON CACHE BOOL "" FORCE) add_subdirectory(SOFIE_core) add_subdirectory(SOFIE_parsers) +add_subdirectory(utils) diff --git a/src/SOFIE_core/CMakeLists.txt b/src/SOFIE_core/CMakeLists.txt index 7297957..4cab8e0 100644 --- a/src/SOFIE_core/CMakeLists.txt +++ b/src/SOFIE_core/CMakeLists.txt @@ -76,6 +76,7 @@ list(TRANSFORM sources_headers PREPEND "inc/") set(sources_cxx src/RModel_Base.cxx src/RModel.cxx + src/RModel_ALPAKA.cxx src/RModel_GNN.cxx src/RModel_GraphIndependent.cxx src/RFunction.cxx @@ -87,18 +88,24 @@ set(sources_cxx target_sources(SOFIE_core PRIVATE ${sources_headers} ${sources_cxx}) target_include_directories(SOFIE_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc) +target_link_libraries(SOFIE_core PUBLIC utils) target_link_libraries(SOFIE_core PUBLIC Tree Core RIO ) -ROOT_GENERATE_DICTIONARY(G__SOFIE ${sources_headers} +ROOT_GENERATE_DICTIONARY(G__SOFIE_core ${sources_headers} LINKDEF inc/LinkDef.h MODULE SOFIE_core OPTIONS --deep ) +# Install the dictionaries. +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core.rootmap + DESTINATION lib) + install(TARGETS SOFIE_core LIBRARY DESTINATION lib ) diff --git a/src/SOFIE_core/README.md b/src/SOFIE_core/README.md index 033cad4..2259d7a 100644 --- a/src/SOFIE_core/README.md +++ b/src/SOFIE_core/README.md @@ -25,7 +25,6 @@ SOFIE works in a parser-generator working architecture. 
With SOFIE, the user get From ROOT command line, or in a ROOT macro, we can proceed with an ONNX model: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; SOFIE::RModel model = parser.Parse(“./example_model.onnx”); model.Generate(); @@ -73,7 +72,6 @@ SOFIE also supports generating inference code with RDataFrame as inputs, refer t Here is the updated list of supported ONNX operators. You can obtain this list by doing ```cpp -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; std::vector supportedOperators = parser.GetRegisteredOperators(); ``` @@ -164,7 +162,6 @@ The above operators are supported for tensors of the following types: You can also check your model whether all operators are implemented by doing the following: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; parser.CheckModel("example_model.ONNX"); ``` diff --git a/src/SOFIE_core/inc/SOFIE/RFunction.hxx b/src/SOFIE_core/inc/SOFIE/RFunction.hxx index 53c30e3..f79691a 100644 --- a/src/SOFIE_core/inc/SOFIE/RFunction.hxx +++ b/src/SOFIE_core/inc/SOFIE/RFunction.hxx @@ -3,6 +3,7 @@ #include "SOFIE/RModel_Base.hxx" #include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/RModel.hxx b/src/SOFIE_core/inc/SOFIE/RModel.hxx index 79541af..50fc231 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel.hxx +++ b/src/SOFIE_core/inc/SOFIE/RModel.hxx @@ -16,14 +16,21 @@ private: int fVerbose = 0; int fBatchSize = -1; long fReadPos = 0; // reading file position + size_t fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors + size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors + size_t fOtherTensorSize = 0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool + + OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended; std::unordered_map fInputTensorInfos; // input tensors where shape may not fully 
defined or other graph inputs? std::unordered_map fReadyInputTensorInfos; // input tensors where shape is full defined std::unordered_map fInitializedTensors; std::unordered_map fIntermediateTensorInfos; std::unordered_map fDynamicTensorInfos; + std::unordered_map, bool>> fShapeTensors; // constant tensors describing a shape std::unordered_map fShapeParams; // parameters defining the dynamic shape (e.g. batch size), store also its default value + std::vector fDimShapeNames; // parameter names used to define the shapes std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order @@ -58,9 +65,14 @@ public: int Verbose() const { return fVerbose;} - const std::vector &GetTensorShape(std::string name) const; - std::vector GetDynamicTensorShape(std::string name) const; - const ETensorType &GetTensorType(std::string name) const; + const std::vector &GetTensorShape(const std::string & name) const; + std::vector GetDimTensorShape(const std::string & name) const; + const ETensorType &GetTensorType(const std::string & name) const; + std::vector GetDynamicTensorShape(const std::string & name) const ; + + // get the values for the tensor representing a shape + const std::vector & GetShapeTensorValues(const std::string & tensor_name) const; + bool CheckIfTensorAlreadyExist(std::string tensor_name); void AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape); @@ -81,6 +93,7 @@ public: size_t length = ConvertShapeToLength(shape); std::shared_ptr data_ptr(malloc(length * sizeof(T)), free); std::memcpy(data_ptr.get(), (void*) data, length * sizeof(T)); + std::cout<<"Length of constant tensor "<(T()), shape, data_ptr); } // for boolean can be more convenient passing an std::vector @@ -102,6 +115,8 @@ public: AddInitializedTensor(tensor_name, GetTemplatedType(T()), shape, data); } + void AddShapeTensor(const std::string & name, const std::vector & shapeValues, bool scalar = false); + // add and initialize subgraph to 
the model void InitializeSubGraph(std::shared_ptr graph); @@ -118,13 +133,15 @@ public: bool IsDimInputTensor(const std::string &name) const; // check if tensor is a fully specified input tensor bool IsReadyInputTensor(const std::string &name) const; + /// check if a tensor is a shape tensor + bool IsShapeTensor(const std::string & name) const; // Add intermediate tensor void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape); void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape); // Add an intermediate dynamic tensor void AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape); - + void AddShapeParam(const std::string & name, size_t def_value = 0); void AddInputTensorName(std::string name); void AddOutputTensorNameList(std::vector output_tensor_names); void @@ -132,6 +149,9 @@ public: void UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data); std::shared_ptr GetInitializedTensorData(std::string tensor_name); + void RemoveInitializedTensor(std::string tensor_name); + template + std::vector GetTensorData(const std::string & name); void Initialize(int batchSize = -1, bool verbose = false); void Initialize(const std::map & inputParams, bool verbose = false); @@ -141,34 +161,64 @@ public: { Generate(static_cast>(options), batchSize, pos, verbose); } + void GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize = -1, bool verbose = false); + void GenerateGPU_ALPAKA(Options options = Options::kDefault, int batchSize = -1, bool verbose = false) + { + GenerateGPU_ALPAKA(static_cast>(options), batchSize, verbose); + } // generate the infer function signature. If isdecl= false generate the calling infer function // used to infer the sub-graphs std::string GenerateInferSignature(bool isdecl = true); + // generate the infer function signature for inference on ALPAKA. 
If isdecl= false generate the calling infer function + // used to infer the sub-graphs + std::string GenerateInferSignature_GPU_ALPAKA(bool isdecl = true); + + void RemoveIntermediateTensor(const std::string& tensor_name){ + fIntermediateTensorInfos.erase(tensor_name); + } + // calculate total intermediate memory and position intermediate tensor addresses - std::string AllocateIntermediateMemory(std::span op_output_tensors); - void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); + std::string AllocateIntermediateMemory(std::span op_output_tensors); + void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); protected: // internal functions // generate code for the initialized tensors void GenerateInitializedTensorInfo(); + + void GenerateInitializedTensorInfo_GPU_ALPAKA(); // generate code for the intermediate tensors void GenerateIntermediateTensorInfo(); + + // generate code for the temporary initialized tensors containers + void GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + // generate code for the dynamic tensors void GenerateDynamicTensorInfo(); + + void GenerateDynamicTensorInfo_GPU_ALPAKA(); // generate code for declarations needed by operators void GenerateOperatorDeclarations(); // generate code for inference void GenerateOutput(); + + void GenerateOutput_GPU_ALPAKA(); + + void MoveInitializedTensorsToBuffers_ALPAKA(); // generate code for initializing memory pool for intermediate tensors void GenerateIntermediateMemoryPool(); // Generate all session code void GenerateSessionCode(); + void GenerateSessionCode_GPU_ALPAKA(); + void GenerateGPU_ALPAKA_Buffers(); + + void CheckAndFuseOperators(); public: const std::vector &GetInputTensorNames() const { return fInputTensorNames; } const std::vector &GetOutputTensorNames() const { return fOutputTensorNames; } + const std::vector & GetDimShapeNames() const { return fDimShapeNames; } void ReadInitializedTensorsFromFile(long); long 
WriteInitializedTensorsToFile(std::string filename = ""); @@ -203,6 +253,21 @@ public: ClassDefNV(RModel, 3); }; +template +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsInitializedTensor(name)) return std::vector{}; + T * data = static_cast(GetInitializedTensorData(name).get()); + size_t size = ConvertShapeToLength(GetTensorShape(name)); + return std::vector(data, data+size); +} + +template<> +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsShapeTensor(name)) return std::vector{}; + return GetShapeTensorValues(name); +} + + } // namespace SOFIE #endif // SOFIE_RMODEL diff --git a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx b/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx index f8a9d34..601e3a9 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx +++ b/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx @@ -12,7 +12,6 @@ #include #include #include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" #include "TBuffer.h" @@ -27,10 +26,26 @@ enum class Options { kGNNComponent = 0x10, }; +// Optimization levels inspired by ONNXRuntime. +// We only get Operator Fusion with the Basic, and +// memory reuse with Extended. 
kExtended is enabled +// by default +enum class OptimizationLevel { + kBasic = 0x0, + kExtended = 0x1, +}; + enum class WeightFileType { None, RootBinary, Text }; -std::underlying_type_t operator|(Options opA, Options opB); -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB); + +inline std::underlying_type_t operator|(Options opA, Options opB) { + return static_cast>(opA) | + static_cast>(opB); +} + +inline std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { + return opA | static_cast>(opB); +} class RModel_Base { @@ -53,6 +68,46 @@ protected: bool fIsGNN = false; bool fIsGNNComponent = false; + // Function to generate the code for declaring and initializing constant tensors + // This is for tensors which are not part of weight files and can be created from the Constant operator + template + std::string GenerateConstantTensorCode(const std::pair &t) + { + std::stringstream strs; + std::string type = ConvertTypeToString(t.second.type()); + size_t length = ConvertShapeToLength(t.second.shape()); + std::cout<<"Constant tensor name: "< 100) ? false : true; + + const T *data = t.second.data(); + + // and check if all values are the same + bool sameData = false; + // for non stack allocation check if data are the same + if (!allocateOnStack && length > 1) { + size_t idx = 1; + std::cout<<"insider allocate on stack and length\n"; + do { + std::cout<<"Printing idx: "< fTensor_" << t.first << " = "; + if (sameData) + strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; + else { + strs << ConvertValuesToString(length, data) << ";\n"; + } + strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; + } + return strs.str(); + } + public: /** Default constructor. Needed to allow serialization of ROOT objects. 
See @@ -82,6 +137,7 @@ public: fCustomOpHeaders.insert(filename); } void GenerateHeaderInfo(std::string &hgname); + void GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname); void PrintGenerated() { std::cout << fGC; } std::string ReturnGenerated() { return fGC; } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator.hxx b/src/SOFIE_core/inc/SOFIE/ROperator.hxx index edbec58..6c9a812 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator.hxx @@ -2,63 +2,113 @@ #define SOFIE_ROPERATOR #include +#include #include #include "SOFIE/SOFIE_common.hxx" -//#include "RModel.hxx" - - namespace SOFIE{ class RModel; +enum class OperatorKind { + GEMM = 0, + LAYERNORM = 1, + RELU = 2, + CONSTANT = 3, + CONSTANTOFSHAPE = 4, + UNDEFINED = 5, + CONV=6, + BATCHNORM=7, + CAST=8, + COMPARISON=9, + EINSUM=10, + ELU=11, + SIGMOID=12, + TANH=13, + SOFTMAX=14, + LEAKYRELU=15, +}; + +inline const char* toString(OperatorKind kind) { + switch (kind) { + case OperatorKind::GEMM: return "GEMM"; + case OperatorKind::LAYERNORM: return "LAYERNORM"; + case OperatorKind::RELU: return "RELU"; + case OperatorKind::CONSTANT: return "CONSTANT"; + case OperatorKind::CONSTANTOFSHAPE: return "CONSTANTOFSHAPE"; + case OperatorKind::BATCHNORM: return "BATCHNORM"; + case OperatorKind::CONV: return "CONV"; + case OperatorKind::UNDEFINED: return "UNDEFINED"; + default: return "UNKNOWN"; + } +} + +inline std::set FusableKinds = { OperatorKind::RELU, OperatorKind::LAYERNORM, OperatorKind::BATCHNORM}; + class ROperator{ public: virtual std::vector GetBlasRoutines() { return {}; } virtual std::vector GetStdLibs() { return {}; } - virtual std::vector> ShapeInference(std::vector>) = 0; - virtual std::vector TypeInference(std::vector) = 0; + virtual std::vector> ShapeInference(std::vector>) { return {}; }; + virtual std::vector TypeInference(std::vector) { return {}; }; virtual void Initialize(RModel&) = 0; virtual std::string Generate(std::string OpName) = 0; //expect unique opName for 
each operator within the same RModel + virtual std::string Generate_GPU_ALPAKA(std::string OpName){ return "";} //expect unique opName for each operator within the same RModel // generate initialization code for session constructor virtual std::string GenerateInitCode() { return "";} + virtual std::string GenerateInitCode_GPU_ALPAKA() { return "";}; // generate some specific declaration code for Session virtual std::string GenerateDeclCode() { return "";} // generate session data members specific to operator virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) { return ""; } virtual std::string Header() { return "";} + virtual std::string GetFusableOutputTensorName() { return "";} + virtual std::string GetBlasConfig() { return ""; } + virtual void UpdateFusableTensorName(std::string, const std::function& removal_func){ return;}; //virtual void Forward_reference() = 0; //virtual void Forward_blas() = 0; virtual ~ROperator(){} protected: - + OperatorKind fKind = OperatorKind::UNDEFINED; + size_t fOpOrder = 0; const std::string SP = " "; ///< space used to correctly indent the generated C++ code bool fUseSession = false; ///< flag to identify if using the session class bool fIsOutputConstant = false; ///< flag to identify if operator has a constant output (no need to generate code) - - mutable std::vector fInputTensorNames; - mutable std::vector fOutputTensorNames; + bool fIsOutputParamShape = false; ///< flag to identify of the output represents a parametric shape (can be knwon at compile time) + + mutable std::vector fInputTensorNames; + mutable std::vector fOutputTensorNames; public: - std::span GetOpInputTensors() const { + std::span GetOpInputTensors() const { return fInputTensorNames; } - std::span GetOpOutputTensors() const { + std::span GetOpOutputTensors() 
const { return fOutputTensorNames; } - + + OperatorKind GetKind() const { return fKind; } + + void RegisterOperatorOrder(const size_t ord){ + fOpOrder = ord; + } + size_t GetOpOrder(){ + return fOpOrder; + } + }; }//SOFIE - #endif //SOFIE_OPERATOR diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx index 127eaff..85953d5 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx @@ -1,5 +1,5 @@ -#ifndef SOFIE_ROperator_BasicBinary -#define SOFIE_ROperator_BasicBinary +#ifndef SOFIE_SOFIE_ROperator_BasicBinary +#define SOFIE_SOFIE_ROperator_BasicBinary #include "SOFIE/SOFIE_common.hxx" #include "SOFIE/ROperator.hxx" @@ -7,9 +7,15 @@ #include -namespace SOFIE{ +namespace SOFIE { -enum EBasicBinaryOperator { Add, Sub, Mul, Div, Pow }; +enum EBasicBinaryOperator { + Add, + Sub, + Mul, + Div, + Pow +}; template struct BinaryOperatorTrait {}; @@ -17,42 +23,42 @@ struct BinaryOperatorTrait {}; template struct BinaryOperatorTrait { static const std::string Name() { return "Add"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " + " + t2; } - static T Func(T t1, T t2) {return t1 + t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " + " + t2; } + static T Func(T t1, T t2) { return t1 + t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Sub"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " - " + t2; } - static T Func (T t1, T t2) { return t1 - t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " - " + t2; } + static T Func(T t1, T t2) { return t1 - t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Mul"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " * " + t2; } - static T Func (T t1, T 
t2) { return t1 * t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " * " + t2; } + static T Func(T t1, T t2) { return t1 * t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Div"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " / " + t2; } - static T Func (T t1, T t2) { return t1/t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " / " + t2; } + static T Func(T t1, T t2) { return t1 / t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Pow"; } - static std::string Op(const std::string & t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } - static T Func (T t1, T t2) { return std::pow(t1,t2);} + static std::string Op(const std::string &t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } + static T Func(T t1, T t2) { return std::pow(t1, t2); } }; -template -class ROperator_BasicBinary final : public ROperator{ +template +class ROperator_BasicBinary final : public ROperator { private: - + int fBroadcastFlag = 0; std::string fNA; std::string fNB; std::string fNBroadcastedA; @@ -63,154 +69,444 @@ private: std::vector fShapeB; std::vector fShapeY; + std::vector fDimShapeA; + std::vector fDimShapeB; + std::vector fDimShapeY; + public: - ROperator_BasicBinary(){} - ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY): - fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNA, fNB }; - fOutputTensorNames = { fNY }; - } + ROperator_BasicBinary() {} + ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY) + : fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = {fNA, fNB}; + fOutputTensorNames = {fNY}; + } // type of output given input - std::vector 
TypeInference(std::vector input) override { - return input; - } + std::vector TypeInference(std::vector input) override { return input; } // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { + std::vector> ShapeInference(std::vector> input) override + { // assume now inputs have same shape (no broadcasting) auto ret = std::vector>(1, input[0]); // return vector size 1 with first input return ret; } - void Initialize(RModel& model) override { + void Initialize(RModel &model) override + { // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNA)){ + if (!model.CheckIfTensorAlreadyExist(fNA)) { throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNA + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNB)) { throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNB + "is not found in model"); } - fShapeA = model.GetTensorShape(fNA); - fShapeB = model.GetTensorShape(fNB); - bool broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB); - if (broadcast) { - // Y is the common shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeA, fShapeB); - bool broadcastA = !UTILITY::AreSameShape(fShapeA, fShapeY); - bool broadcastB = !UTILITY::AreSameShape(fShapeB, fShapeY); - // Broadcast A to Y - if (broadcastA) { - fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; - if (model.IsInitializedTensor(fNA)) { - auto data = model.GetInitializedTensorData(fNA); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); - fShapeA = fShapeY; + int dynamicInputs = 0; + if (model.IsDynamicTensor(fNA)) { + fDimShapeA = model.GetDimTensorShape(fNA); + dynamicInputs |= 1; + } else { + 
fShapeA = model.GetTensorShape(fNA); + fDimShapeA = ConvertShapeToDim(fShapeA); + } + if (model.IsDynamicTensor(fNB)) { + dynamicInputs |= 2; + fDimShapeB = model.GetDimTensorShape(fNB); + } else { + fShapeB = model.GetTensorShape(fNB); + fDimShapeB = ConvertShapeToDim(fShapeB); + } + if (dynamicInputs & 1 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNA << " is dynamic " + << ConvertDimShapeToString(fDimShapeA) << " "; + if (dynamicInputs & 2 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNB << " is dynamic " + << ConvertDimShapeToString(fDimShapeB) << " "; + std::cout << std::endl; + // check if need to broadcast at initialization time if shapes are known and different + // (we could broadcast the tensor tensor to maximum values of dynamic shapes - to be done) + // case of known shapes + // if shapes are known find the output shape from broadcasting + if (dynamicInputs == 0) { + auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeA, fShapeB); + fBroadcastFlag = ret.first; + fShapeY = ret.second; + if (model.IsConstantTensor(fNA) && model.IsConstantTensor(fNB)) { + bool broadcast = fBroadcastFlag > 0; + if (broadcast) { + // Y is the common shape of A and B + bool broadcastA = fBroadcastFlag & 2; + bool broadcastB = fBroadcastFlag & 1; + // Broadcast A to Y + if (broadcastA) { + fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; + auto data = model.GetInitializedTensorData(fNA); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), + std::default_delete()); + if (model.Verbose()) + std::cout << "broadcasted data A " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + // Update the data and the shape of A + model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); + fShapeA = fShapeY; + 
fDimShapeA = ConvertShapeToDim(fShapeA); + } + // Broadcast B to Y + if (broadcastB) { + fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; + auto data = model.GetInitializedTensorData(fNB); + if (model.Verbose()) + std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) + << std::endl; + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), + std::default_delete()); + // do not update tensor B but add broadcasted one (since it can be input to some other operators) + if (model.Verbose()) + std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); + fShapeB = fShapeY; + fDimShapeB = ConvertShapeToDim(fShapeB); + } } else { - // Add an intermediate tensor for broadcasting A - model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY); + fShapeY = fShapeA; } - } - // Broadcast B to Y - if (broadcastB) { - fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; - if (model.IsInitializedTensor(fNB)) { - auto data = model.GetInitializedTensorData(fNB); - std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) << std::endl; - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), - std::default_delete()); - // do not update tensor B but add broadcasted one (since it can be input to some other operators) - std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeY), static_cast(broadcastedData.get())) << std::endl; - model.AddConstantTensor(fNBroadcastedB, 
model.GetTensorType(fNB), fShapeY, broadcastedData); - fShapeB = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY); + // tensors are constant: perform here the binary operation + + const std::string &nameA = fNBroadcastedA.empty() ? fNA : fNBroadcastedA; + const std::string &nameB = fNBroadcastedB.empty() ? fNB : fNBroadcastedB; + auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + std::vector dataY(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < dataY.size(); i++) { + dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); + } + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + // flag tensors to not be written in the weight file + model.SetNotWritableInitializedTensor(nameA); + model.SetNotWritableInitializedTensor(nameB); + fIsOutputConstant = true; + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(dataY) << std::endl; + } + } else { + // case of defined and non-constant tensors + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << std::endl; } + // we convert non-dim shapes to Dim shapes + fDimShapeY = ConvertShapeToDim(fShapeY); } } else { - fShapeY = fShapeA; - } - // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB)) { - const std::string& nameA = fNBroadcastedA.empty()? 
fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) { - dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); + // case A or B have dynamic shapes. We need to broadcast if shape are not same + auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeA, fDimShapeB); + fBroadcastFlag = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; + } + } + return false; + }; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(fDimShapeA[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (fDimShapeA[i].dim != 1) + s = fDimShapeA[i]; + else + s = fDimShapeB[i]; + } else if (IsInputDimParam(fDimShapeB[i].param)) { + if (fDimShapeB[i].dim != 1) + s = fDimShapeB[i]; + else + s = fDimShapeA[i]; + } + } + } + } + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fDimShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << ConvertDimShapeToString(fDimShapeA) << " , " + << ConvertDimShapeToString(fDimShapeB) << " --> " << ConvertDimShapeToString(fDimShapeY) << std::endl; } - 
model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a fil - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); - fIsOutputConstant = true; - if (model.Verbose()) - std::cout << "Binary op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - } - else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); } } - std::string GenerateInitCode() override { + std::string GenerateInitCode() override + { std::stringstream out; return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override + { - if (fIsOutputConstant) return ""; + if (fIsOutputConstant) + return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Binary Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//------ " << BinaryOperatorTrait::Name() << "\n"; - size_t length = ConvertShapeToLength(fShapeY); + out << SP << "\n//------ " << opName << " " << BinaryOperatorTrait::Name() << " --> " + << ConvertDimShapeToString(fDimShapeY) << "\n"; + auto length = ConvertDimShapeToLength(fDimShapeY); std::string typeName = TensorType::Name(); - // Broadcast A if it's uninitialized - // use broadcasting function where we pass an already allocated tensor to minimize memory allocations - if (fShapeA != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; - } - // Broadcast B if it's uninitialized - if (fShapeB != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; - out << SP << 
"SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; - } - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = " << BinaryOperatorTrait::Op( "tensor_" + nameA + "[id]" , "tensor_" + nameB + "[id]") << " ;\n"; - out << SP << "}\n"; + + // we need to check if we can broadcast (case flag has bit 4 set) + + if (fBroadcastFlag & 4) { + // need to check if shapes are the same + auto lengthA = ConvertDimShapeToLength(fDimShapeA); + auto lengthB = ConvertDimShapeToLength(fDimShapeB); + out << SP << "if (" << lengthA << "!=" << lengthB << ") {\n"; + // check if A->B or B->A + // bool broadcastable = true; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + if (fBroadcastFlag & 5 && fDimShapeY[i] == fDimShapeA[i] && fDimShapeA[i].dim > 1 && + fDimShapeB[i].isParam) { + // B->A B[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeB[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast B->A in operator " + << opName << "\");\n"; + } + if (fBroadcastFlag & 6 && fDimShapeY[i] == fDimShapeB[i] && fDimShapeB[i].dim > 1 && + fDimShapeA[i].isParam) { + // A-> B A[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeA[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast A->B in operator " + << opName << "\");\n"; + } else if (fDimShapeA[i].isParam && fDimShapeB[i].isParam) { + // both shapes are parametric and we broadcast to maximum + // we allocate here output vector + out << SP << SP << "if (" << fDimShapeA[i] << " != " << fDimShapeB[i] << " && (" << fDimShapeA[i] + << " != 1 || " << fDimShapeB[i] << " != 1))\n"; + out << 
SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast shapes in operator " << opName + << "\");\n"; + } + } + out << SP << "}\n"; + } + + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeB); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_A, compute_idx_B, compute_idx_Y; + if (fDimShapeA.empty() || + std::all_of(fDimShapeA.begin(), fDimShapeA.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_A = "0"; + } else { + for (size_t i = 0; i < fDimShapeA.size(); ++i) { + if (fDimShapeA[i].dim == 1 || fDimShapeA[i].GetVal() == "1") + continue; + compute_idx_A += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeA.size())); + if (stridesA[i].GetVal() != "1") + compute_idx_A += " * " + stridesA[i].GetVal(); + compute_idx_A += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_A.pop_back(); + } + if (fDimShapeB.empty() || + std::all_of(fDimShapeB.begin(), fDimShapeB.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_B = "0"; + } else { + for (size_t i = 0; i < fDimShapeB.size(); ++i) { + if (fDimShapeB[i].dim == 1 || fDimShapeB[i].GetVal() == "1") + continue; + compute_idx_B += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeB.size())); + if (stridesB[i].GetVal() != "1") + compute_idx_B += " * " + stridesB[i].GetVal(); + compute_idx_B += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_B.pop_back(); + } + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 
0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNY << "[" << compute_idx_Y << "] = " + << BinaryOperatorTrait::Op("tensor_" + fNA + "[" + compute_idx_A + "]", + "tensor_" + fNB + "[" + compute_idx_B + "]") + << " ;\n"; + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } return out.str(); } - std::vector GetStdLibs() override { + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ "+opName+"_"+BinaryOperatorTrait::Name()+"_KERNEL_ALPAKA\n"; + op += SP + "struct Binary"+opName+BinaryOperatorTrait::Name()+"Kernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * A, T const * B, T * C) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < " + ConvertShapeToLength(fShapeY) + ") {\n"; + auto stridesA = UTILITY::ComputeStrideFromShape(fShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fShapeB); + + for(size_t id_s = 0; id_s < stridesA.size(); ++id_s){ + if(fShapeA[id_s] == 1) + stridesA[id_s] = 0; + } + + for(size_t id_s = 0; id_s < stridesB.size(); ++id_s){ + if(fShapeB[id_s] == 1) + stridesB[id_s] = 0; + } + + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + std::string flattened_index_A = ""; + std::string flattened_index_B = ""; + std::string temp = "idx"; + + op += "// stridesY " + ConvertShapeToString(stridesY) + "\n"; + op += "// stridesA " + ConvertShapeToString(stridesA) + "\n"; + op += "// stridesB " + ConvertShapeToString(stridesB) + "\n"; 
+ + for (size_t id_s = 0; id_s < fShapeA.size(); ++id_s) { + + auto strideY = stridesY[id_s]; + auto strideA = stridesA[id_s]; + + // coord expression + std::string coord = "(int)(" + temp + " / " + std::to_string(strideY) + ")"; + + // accumulate into final index + flattened_index_A += coord + " * " + std::to_string(strideA) + " + "; + + // update temp correctly + temp = temp + " - (" + coord + " * " + std::to_string(strideY) + ")"; + } + + // remove trailing " + " + if (!flattened_index_A.empty()) + flattened_index_A.erase(flattened_index_A.size() - 3); + + temp = "idx"; + + for (size_t id_s = 0; id_s < fShapeB.size(); ++id_s) { + + auto strideY = stridesY[id_s]; + auto strideB = stridesB[id_s]; + + // coord expression + std::string coord = "(int)(" + temp + " / " + std::to_string(strideY) + ")"; + + // accumulate into final index + flattened_index_B += coord + " * " + std::to_string(strideB) + " + "; + + // update temp correctly + temp = temp + " - (" + coord + " * " + std::to_string(strideY) + ")"; + } + + // remove trailing " + " + if (!flattened_index_B.empty()) + flattened_index_B.erase(flattened_index_B.size() - 3); + + + op += "C[idx] = " + BinaryOperatorTrait::Op("A["+flattened_index_A+"]", "B["+flattened_index_B+"]") + ";\n"; + op += "}\n}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + return SP + "Binary"+OpName+BinaryOperatorTrait::Name()+"Kernel binary" + OpName + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + if (fDimShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Basic Binary called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fDimShapeY); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const 
kernelCfg_" << fNY << " = {elementsPerGrid_" << fNY << ", elementsPerThread_" << fNY << "};\n"; + out << SP << "auto const workDiv_" << fNY << " = alpaka::getValidWorkDiv(kernelCfg_" << fNY << ", devAcc, binary" << OpName << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNA + << "), alpaka::getPtrNative(deviceBuf_" << fNB << "), alpaka::getPtrNative(deviceBuf_" << fNY << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", binary" << OpName << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNA + << "), alpaka::getPtrNative(deviceBuf_" << fNB << "), alpaka::getPtrNative(deviceBuf_" << fNY << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + + std::vector GetStdLibs() override + { if (Op == EBasicBinaryOperator::Pow) { - return { std::string("cmath") }; + return {std::string("cmath")}; } else { return {}; } } -}; -}//SOFIE + +}; +} // namespace SOFIE -#endif //SOFIE_ROperator_BasicBinary +#endif // SOFIE_ROperator_BasicBinary diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx index c18c17e..b98ded5 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx @@ -107,6 +107,33 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ " + UnaryOpTraits::Name() + "_KERNEL_ALPAKA\n"; + op += SP + "struct Unary" + UnaryOpTraits::Name() + "Kernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T* data, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "data[i] = " << UnaryOpTraits::Op("data[i]") << ";\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "Unary" + UnaryOpTraits::Name() + 
"Kernel " + UnaryOpTraits::Name() + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + std::stringstream out; + auto length = ConvertShapeToLength(fShapeX); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(length+255)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", " << UnaryOpTraits::Name() << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), static_cast(" << length << ")); \n"; + return out.str(); + } + std::vector GetStdLibs() override { if (Op == EBasicUnaryOperator::kSqrt || Op == EBasicUnaryOperator::kExp || Op == EBasicUnaryOperator::kLog) { return { std::string("cmath") }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx index a27cea4..1a6098d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_ROPERATOR_BatchNormalization #define SOFIE_ROPERATOR_BatchNormalization -#include "SOFIE_common.hxx" -#include "ROperator.hxx" -#include "RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx index 47c3d66..2cb797b 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx @@ -10,6 +10,14 @@ namespace SOFIE{ +template +std::vector convertToInt64(const In* src, size_t n) { + std::vector dst(n); + std::transform(src, src + n, dst.begin(), + [](In v) { return static_cast(v); }); + return dst; +} + class ROperator_Cast final : public ROperator { @@ -26,6 +34,7 @@ public: ROperator_Cast(std::string attr_type,std::string nameX, std::string nameY): fNX(UTILITY::Clean_name(nameX)), 
fNY(UTILITY::Clean_name(nameY)), fAttrType(attr_type) { + fKind = OperatorKind::CAST; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; } @@ -47,11 +56,67 @@ public: fShape = model.GetTensorShape(fNX); // shoud we add a check if the same type auto inputType = model.GetTensorType(fNX); + const size_t n = ConvertShapeToLength(fShape); if (model.IsInitializedTensor(fNX)) { fIsOutputConstant = true; auto inputData = model.GetInitializedTensorData(fNX); if (ConvertStringToType(fAttrType) == ETensorType::INT64) { - model.AddConstantTensor(fNY, fShape, static_cast(inputData.get())); + auto inputTypeStr = ConvertTypeToString(inputType); + if (inputTypeStr == "int32_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "float") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "double") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "int8_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "int16_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "uint8_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "uint16_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "uint32_t") { + auto* src = static_cast(inputData.get()); + 
auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "uint64_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "int64_t") { + model.AddConstantTensor( + fNY, fShape, + static_cast(inputData.get()) + ); + } + else { + throw std::runtime_error("Unsupported input type for INT64 conversion"); + } + model.SetNotWritableInitializedTensor(fNX); } else @@ -90,6 +155,45 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + std::string op; + op = "\n//------ CAST_KERNEL_ALPAKA\n"; + op += SP + "struct CastKernel"+opName+"{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, SrcT const * src, DstT * dst, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "dst[i] = static_cast(src[i]);\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + return SP + "CastKernel"+opName+" castKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + if (fIsOutputConstant) return ""; + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Cast called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ CAST_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNY << " = {elementsPerGrid_" << fNY << ", elementsPerThread_" << fNY << "};\n"; + out << SP << "auto const workDiv_" << fNY << " = 
alpaka::getValidWorkDiv(kernelCfg_" << fNY << ", devAcc, castKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY << ", castKernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << ")); \n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx index 7648a9a..a00ed28 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx @@ -73,6 +73,7 @@ public: ROperator_Comparision(){} ROperator_Comparision(const std::string & nameX1, const std::string & nameX2, const std::string & nameY): fNX1(UTILITY::Clean_name(nameX1)), fNX2(UTILITY::Clean_name(nameX2)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::COMPARISON; fInputTensorNames = { fNX1, fNX2 }; // output will be a boolean vector so should not be considered for memory optimized pool diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx index 0d5e574..10d6d0d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx @@ -1,5 +1,5 @@ #ifndef SOFIE_ROPERATOR_Concat - #define SOFIE_ROPERATOR_Concat +#define SOFIE_ROPERATOR_Concat #include "SOFIE/SOFIE_common.hxx" @@ -23,8 +23,10 @@ std::string fOutput; std::vectorfOutputShape; std::vector> fInputShapes; + ETensorType fInputType; public: + ROperator_Concat(){} ROperator_Concat(std::vector inputs, int axis, int newAxis, std::string output): fAxis(axis), fnewAxis(newAxis), fOutput(UTILITY::Clean_name(output)) { @@ -53,6 +55,7 @@ throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); int concat_dim=0; + // case of Concat (fNewAxis = 0) and not ConcatFromSequence 
if(fnewAxis == 0){ for (size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i - 1].size()) @@ -73,6 +76,7 @@ ret[0][fAxis] = concat_dim; } std::vector stack; + // case ConCatFromSequence if(fnewAxis == 1){ for(size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i-1].size() ) @@ -96,8 +100,8 @@ } // get shape of output given inputs. It is going to be called after initialized - std::vector> ShapeInference(const std::vector> & inputs) { - std::vector> ret(1); + std::vector ShapeInference(const std::vector> & inputs, const RModel & model) { + std::vector ret(inputs[0].size()); // treat negative axis case if (fAxis<0) { fAxis = inputs[0].size()+fAxis; @@ -105,31 +109,54 @@ if (fAxis < 0 || fAxis >= (int) inputs[0].size()) throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); - int concat_dim=0; + Dim concat_dim; if(fnewAxis == 0){ for (size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i - 1].size()) throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + - ConvertDynamicShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDynamicShapeToString(inputs[i - 1])); + ConvertDimShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDimShapeToString(inputs[i - 1])); for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { if ((int)iaxis == fAxis) { - // support only non-params shape for the concatenation axis - if (inputs[i][iaxis].isParam) - throw std::runtime_error("TMVA SOFIE Concat Op - not supporting input param dimensions for concatenation axis. 
Input shape is " + - ConvertDynamicShapeToString(inputs[i])); - concat_dim += inputs[i][iaxis].dim; + // support both integer and params shape for the concatenation axis + if (concat_dim.param.empty() && concat_dim.dim == 0) + concat_dim = inputs[i][iaxis]; + else if (inputs[i][iaxis].isParam || concat_dim.isParam) { + concat_dim = + Dim{ concat_dim.GetVal() + std::string(" + ") + inputs[i][iaxis].GetVal(), + static_cast(-1)}; + } else { + concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim }; + } + } + else if (i == 0) { + ret[iaxis] = inputs[i][iaxis]; } - // other dimensions must be the same - else if (i > 0 && inputs[i][iaxis].GetVal() != inputs[i - 1][iaxis].GetVal()) + else if ((!inputs[i][iaxis].isParam && !ret[iaxis].isParam) && (inputs[i][iaxis].dim != ret[iaxis].dim)) { throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertDynamicShapeToString(inputs[i]) + " and " + - ConvertDynamicShapeToString(inputs[i - 1])); + ConvertDimShapeToString(inputs[i]) + " and " + + ConvertDimShapeToString(inputs[i - 1])); + } + else if (!inputs[i][iaxis].isParam && ret[iaxis].isParam){ + // if shape is not parametric use it + ret[iaxis] = inputs[i][iaxis]; + } + else if (inputs[i][iaxis].isParam && ret[iaxis].isParam) { + // check which parameter is first in RModel list + auto & dimNames = model.GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), inputs[i][iaxis].param); + auto p2 = std::find(dimNames.begin(), dimNames.end(), ret[iaxis].param); + if (p1 < p2) ret[iaxis] = inputs[i][iaxis]; + } + } + // add parenthesis in case is an expression + if (concat_dim.isParam && concat_dim.dim == static_cast(-1)) + concat_dim = Dim{ std::string("(") + concat_dim.GetVal() + std::string(")"), concat_dim.dim }; } - // output shape - ret[0] = inputs[0]; - ret[0][fAxis].dim = concat_dim; + // output shape for concatenated axis + ret[fAxis] = concat_dim; + } // case of stacking (not supported yet) // here we need to check 
that input shapes are the same @@ -141,24 +168,31 @@ return ret; } - void Initialize(RModel& model) override { + void Initialize(RModel& model) override { for (auto &it : fInputs) { if (model.CheckIfTensorAlreadyExist(it) == false) { throw std::runtime_error("TMVA SOFIE Concat Op Input Tensor " + it + " is not found in model"); } - fInputShapes.push_back(model.GetDynamicTensorShape(it)); + fInputShapes.push_back(model.GetDimTensorShape(it)); } - fOutputShape = ShapeInference(fInputShapes)[0]; + fOutputShape = ShapeInference(fInputShapes, model); if (model.Verbose()) - std::cout << "Output of concat operator has shape " << ConvertDynamicShapeToString(fOutputShape) << std::endl; + std::cout << "Output of concat operator has shape " << ConvertDimShapeToString(fOutputShape) << std::endl; // check if concat has constant inputs , axis 0(concat contigous memory and type is integer) + bool isOutputShape = false; + fInputType = model.GetTensorType(fInputs[0]); if (model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0) { fIsOutputConstant = true; + isOutputShape = true; + for ( auto & input : fInputs) { if (!model.IsInitializedTensor(input)) { fIsOutputConstant = false; - break; + if (!model.IsShapeTensor(input)) { + isOutputShape = false; + break; + } } } if (fIsOutputConstant) { @@ -171,32 +205,64 @@ size_t inputLength = ConvertShapeToLength(inputShape); std::copy(inputData, inputData + inputLength, outputData.begin() + offset ); offset += inputLength; - // data do not need to be written as a weight + // the data of the input tensor don't need to be written in the generated code and data file model.SetNotWritableInitializedTensor(input); } model.AddConstantTensor(fOutput, outputShape, outputData.data()); if (model.Verbose()) { std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : " - << ConvertValuesToString(outputData) << std::endl; + << ConvertValuesToString(outputData) << " (constant)" << std::endl; } + } else if 
(isOutputShape) { + auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible + std::vector outputData(ConvertShapeToLength(outputShape)); + size_t offset = 0; + for ( auto & input : fInputs) { + std::vector inputData; + auto inputShape = model.GetTensorShape(input); // shape is not dynamic + size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a scalar + if (model.IsShapeTensor(input)) { + inputData = model.GetShapeTensorValues(input); + } else if (model.IsInitializedTensor(input)) { + inputData.resize(inputLength); + auto intData = static_cast(model.GetInitializedTensorData(input).get()); + for (size_t i = 0; i < inputData.size(); i++) + inputData[i] = Dim{ static_cast(intData[i])}; + } + else { + // this should not happen + throw std::runtime_error("TMVA SOFIE Concat Operator- invalid input type for shape output type"); + } + std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset ); + offset += inputLength; + } + // add output tensor + model.AddShapeTensor(fOutput,outputData, false); // cannot be a scalar + if (model.Verbose()) { + std::cout << "output of Concat is a shape tensor " << ConvertShapeToString(outputShape) << " : " + << ConvertDimShapeToString(outputData) << " (shape)" << std::endl; + } + fIsOutputConstant = true; } } if (!fIsOutputConstant) { model.AddIntermediateTensor(fOutput, model.GetTensorType(fInputs[0]), fOutputShape); if (model.Verbose()) { - std::cout << "Concat ---> " << fOutput << " " << ConvertDynamicShapeToString(fOutputShape) << std::endl; + std::cout << "Concat ---> " << fOutput << " " << ConvertDimShapeToString(fOutputShape) << std::endl; } } } - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; - OpName = "op_"+OpName; + std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out<<"\n//--------- Concat " << opName << " --> " << fOutput << " " << ConvertDimShapeToString(fOutputShape) << 
"\n"; + + if (fIsOutputConstant) return out.str(); + if(fOutputShape.empty()){ throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first"); } - std::stringstream out; - out<<"\n//--------- Concat\n"; // special case when memory is contiguous bool hasShapeOnes = true; for(int i = 0; i 0) out << offset; offset += " + " + length; @@ -238,14 +304,14 @@ for (size_t j = 0; j < fInputs.size(); j++) { if (j>0) - out << SP << SP << SP << "idxOut += " << fInputShapes[j-1][fAxis].GetVal() << ";\n"; + out << SP << SP << SP << "idxOut += " << inStrides[j-1][fAxis-1].GetVal() << ";\n"; out << SP << SP << SP << "int idxIn" << j <<" = "; for (int k = 0; k < fAxis; k++) { if (k > 0) out << " + "; out << inStrides[j][k].GetVal() << "*i" << k; } out << ";\n"; - out << SP << SP << SP << "for (size_t iC = 0; iC < " << fInputShapes[j][fAxis].GetVal() << "; ++iC) {\n"; + out << SP << SP << SP << "for (size_t iC = 0; iC < " << inStrides[j][fAxis-1].GetVal() << "; ++iC) {\n"; out << SP << SP << SP << SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n"; out << SP << SP << SP << "}\n"; // concatenate the axis values @@ -257,7 +323,131 @@ return out.str(); } - }; + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fOutputShape.empty()) + throw std::runtime_error("TMVA SOFIE Operator Concat called to Generate without being initialized first"); + + const std::size_t D = fOutputShape.size(); + const std::size_t Nin = fInputs.size(); + + auto outStrides = UTILITY::ComputeStrideFromShape(fOutputShape); + + std::vector prefix(Nin); + prefix[0] = 0; + for (std::size_t k = 1; k < Nin; ++k) + prefix[k] = prefix[k - 1] + std::stoul(fInputShapes[k - 1][fAxis].GetVal()); + + std::vector> inStrides(Nin); + for (std::size_t k = 0; k < Nin; ++k) + inStrides[k] = UTILITY::ComputeStrideFromShape(fInputShapes[k]); + + std::string op; + op 
= "\n//------ CONCAT_KERNEL_ALPAKA\n"; + op += SP + "struct ConcatKernel_" + opName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "std::array inputs,\n"; + op += SP + SP + SP + "T* output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "std::size_t remaining;\n"; + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "remaining = elem_idx;\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string stride_val = outStrides[d].GetVal(); + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = remaining / " + stride_val + "u;\n"; + op += SP + SP + SP + SP + "remaining -= out_" + std::to_string(d) + + " * " + stride_val + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t chosen = 0;\n"; + for (std::size_t k = 0; k < Nin; ++k) { + std::size_t end_k = prefix[k] + std::stoul(fInputShapes[k][fAxis].GetVal()); + op += SP + SP + SP + SP + "chosen += static_cast(" + + std::to_string(end_k) + "u <= out_" + std::to_string(fAxis) + ");\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const output_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + SP + "out_" + std::to_string(d) + + " * " + outStrides[d].GetVal() + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t k = 0; k < Nin; ++k) { + op += SP + SP + SP + SP + SP + "(chosen == " + std::to_string(k) + "u) * (\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == static_cast(fAxis)) + ? ("(out_" + std::to_string(d) + " - " + std::to_string(prefix[k]) + "u)") + : ("out_" + std::to_string(d)); + op += SP + SP + SP + SP + SP + SP + coord + + " * " + inStrides[k][d].GetVal() + "u"; + op += (d + 1 < D) ? " +\n" : "\n"; + } + op += SP + SP + SP + SP + SP + ")"; + op += (k + 1 < Nin) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[output_idx] = inputs[chosen][input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + return SP + "ConcatKernel_" + opName + " concatKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fOutputShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Concat called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fOutputShape); + out << "\n//------ CONCAT_GPU_ALPAKA\n"; + switch (fInputType){ + case ETensorType::FLOAT: + out << SP << "std::array input_ptrs_" << OpName << " = {"; break; + case ETensorType::INT64: + out << SP << "std::array input_ptrs_" << OpName << " = {"; break; + default: + throw std::runtime_error("Data type for Concat operator is not yet supported."); + } + for(size_t i=0; i0) out << ", "; + out << "alpaka::getPtrNative(deviceBuf_" << fInputs[i] << ")"; + } + out << "};\n"; + + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << OpName << " = {elementsPerGrid_" << OpName << ", elementsPerThread_" << OpName << 
"};\n"; + out << SP << "auto const workDiv_" << OpName << " = alpaka::getValidWorkDiv(kernelCfg_" << OpName << ", devAcc, concatKernel_" << OpName << ", input_ptrs_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fOutput << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << OpName + << ", concatKernel_" << OpName << ", input_ptrs_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fOutput << "), static_cast(" << length << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + + }; }//SOFIE + #endif //SOFIE_ROPERATOR_CONCAT diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx index 0d08432..6590909 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx @@ -101,6 +101,11 @@ public: // no code to generate here. Tensor are defined in Session constructor return "//---------------------------------------\n"; } + + std::string Generate_GPU_ALPAKA(std::string /* OpName */) override { + // no code to generate here. 
Tensor are defined in Session constructor + return "//---------------------------------------\n"; + } }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx index 0467385..b9d917b 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX #define SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX -#include -#include -#include +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx index e9b555b..901bff8 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx @@ -41,6 +41,7 @@ public: ROperator_Einsum(const std::string & equation, const std::vector & namesX, const std::string & nameY): fNInputs(namesX.size()), fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::EINSUM; for (size_t i = 0; i < namesX.size(); i++) fNInputs[i] = UTILITY::Clean_name(namesX[i]); diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx index 34e18a6..dcbfd68 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx @@ -27,6 +27,7 @@ public: ROperator_Elu(float alpha,std::string nameX, std::string nameY): falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::ELU; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx index c834a06..786556d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx @@ -14,9 +14,10 @@ template class ROperator_Expand final : public ROperator{ 
private: - std::vector fShapeX; + std::vector fShapeX; std::vector fShape; - std::vector fShapeY; + std::vector fShapeY; + std::vector fShapeDim; std::string fNX; std::string fNShape; @@ -24,6 +25,8 @@ private: std::string fType; bool fInitialized = false; + bool fInitializedShape = false; + bool fInitBroadcast = false; public: ROperator_Expand(){} @@ -33,97 +36,318 @@ public: fOutputTensorNames = { fNY }; } - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - return input; - } void Initialize(RModel& model) override { // input must be a graph input, or already initialized intermediate tensor if (!model.CheckIfTensorAlreadyExist(fNX)) { throw std::runtime_error("TMVA SOFIE Expand Op Input Tensor " + fNX + " is not found in model"); } - fShapeX = model.GetTensorShape(fNX); - if (!model.IsInitializedTensor(fNShape)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNShape + " is not initialized."); - } - int64_t *shapeData = + fShapeX = model.GetDimTensorShape(fNX); + if (model.IsInitializedTensor(fNShape)) { + fInitializedShape = true; + int64_t *shapeData = static_cast(model.GetInitializedTensorData(fNShape).get()); - fShape = model.GetTensorShape(fNShape); - if (fShape.size() != 1) { - throw std::runtime_error("TMVA::SOFIE - Expand operator shape must be a 1d tensor."); + fShape = model.GetTensorShape(fNShape); + if (fShape.size() != 1) { + throw std::runtime_error("TMVA::SOFIE - Expand operator shape must be a 1d tensor."); + } + size_t N = fShape[0]; + // what do we do if shapeData contains negative values? 
+ for (size_t i = 0; i < N; i++) { + if ( shapeData[i] < 0) + throw std::runtime_error("TMVA::SOFIE - Expand: invalid shape value " + std::to_string(shapeData[i])); + } + std::vector shape(shapeData, shapeData + N); + fShapeDim = ConvertShapeToDim(shape); + } else if (model.IsShapeTensor(fNShape)) { + // case input shape is a shape tensor + fShapeDim = model.GetShapeTensorValues(fNShape); + fInitializedShape = true; + } else { + // assume shape of input shape is known (size is 1) + auto shapeOfInputShape = model.GetTensorShape(fNShape); + fShapeDim.resize(shapeOfInputShape[0]); + for (size_t i = 0; i < fShapeDim.size(); i++) { + fShapeDim[i] = Dim{std::string("v_") + fNShape + "_" + std::to_string(i)}; + model.AddShapeParam(fShapeDim[i].param); + } } - size_t N = fShape[0]; - std::vector shape(shapeData, shapeData + N); // Y is the common shape of fShapeX and shape - fShapeY = SOFIE::UTILITY::UnidirectionalBroadcastShape( - fShapeX, shape); - fInitialized = model.IsInitializedTensor(fNX); - // Broadcast X to the common shape fShapeY - bool broadcast = !UTILITY::AreSameShape(fShapeX, fShapeY); - if (model.IsInitializedTensor(fNX)) { + auto ret = SOFIE::UTILITY::MultidirectionalBroadcastShape(fShapeX, fShapeDim); + fShapeY = ret.second; + fInitialized = model.IsInitializedTensor(fNX) && fInitializedShape; + std::vector shapeX; + std::vector shapeY; + // case shape tensor and input shape are known + if (!model.IsDynamicTensor(fNX) && !model.IsDimInputTensor(fNX) && fInitializedShape) { + shapeX = ConvertShapeToInt(fShapeX); + shapeY = ConvertShapeToInt(fShapeY); + if (!UTILITY::AreSameShape(shapeX, shapeY)) + fInitBroadcast = true; + } + if (fInitialized) { + // cannot have Dim initialized tensors + assert(!shapeX.empty() && !shapeY.empty()); + // Broadcast X to the common shape shapeY // If X is an initialized tensor (constant) auto data = model.GetInitializedTensorData(fNX); - if (broadcast) { + if (fInitBroadcast) { std::shared_ptr broadcastedData( - 
UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX, fShapeY), + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), shapeX, shapeY), std::default_delete()); // Update the data and the shape of X - model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), fShapeY, broadcastedData); + model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), shapeY, broadcastedData); fShapeX = fShapeY; // need to set as a not writable tensor model.SetNotWritableInitializedTensor(fNX); data = broadcastedData; } - if (broadcast || model.IsConstantTensor(fNX)) { + if (fInitBroadcast || model.IsConstantTensor(fNX)) { fIsOutputConstant = true; // constant output in this case - model.AddConstantTensor(fNY, model.GetTensorType(fNX), fShapeY, data); + model.AddConstantTensor(fNY, model.GetTensorType(fNX), shapeY, data); fOutputTensorNames.pop_back(); } else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), shapeY); } } else { - // case input is not initialized - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + // // case input is not initialized + // if (shapeX.empty() && shapeDim.empty()) { + + // } + // if (fInitializedShape) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); } fType = ConvertTypeToString(model.GetTensorType(fNX)); - if (model.Verbose()) - std::cout << "Expand - output is with shape " << ConvertShapeToString(fShapeY) << std::endl; + if (model.Verbose()) { + std::cout << "Expand - input " << fNX << " shape " << ConvertDimShapeToString(fShapeX) << " --> " << fNY << " shape " + << ConvertDimShapeToString(fShapeY) << (fIsOutputConstant ? 
ConvertValuesToString(model.GetTensorData(fNY)) + " (constant)" : "") << std::endl; + } + + if (fInitializedShape && model.IsInitializedTensor(fNShape)) { + // Shape values are fully consumed into fShapeY/fShapeDim at generation time — + // no device buffer needed for fNShape for Heterogeneous inference + model.SetNotWritableInitializedTensor(fNShape); + } } std::string GenerateInitCode() override { std::stringstream out; - if (!fIsOutputConstant && (fInitialized || fShapeX == fShapeY ) ) { - size_t length = ConvertShapeToLength(fShapeY); + if (!fIsOutputConstant && fInitialized && !fInitBroadcast) { + // shapeX and shapeY are the same in this case + auto length = ConvertDimShapeToLength(fShapeY); out << "// Copying initialized tensor " << fNX << " to " << fNY << "\n"; out << SP << "std::copy(tensor_" << fNX << ", " << "tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; } return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; if (fShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Expand Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//------ Expand Op" << "\n"; + out << SP << "\n//------ Expand " << opName << " --> " << ConvertDimShapeToString(fShapeY) << "\n"; + // need to declare shape parameters for non initialized shapes + if (!fInitializedShape) { + for (size_t i = 0; i < fShapeDim.size(); i++) { + out << SP << "size_t " << fShapeDim[i] << " = " << "tensor_" << fNShape << "[" << i << "];\n"; + } + } // No need to broadcast A if it's an initialized tensor or shapes are the same if (!fInitialized && fShapeX != fShapeY) { out << SP << "// Broadcasting uninitialized tensor " << fNX << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" << fNX << ", " << ConvertShapeToString(fShapeX) << ", " << 
ConvertShapeToString(fShapeY) - << ", std::span<"<(tensor_"<& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + if (!isStatic(fShapeX) || !isStatic(fShapeY)) return ""; + + // Check if broadcast is actually needed + bool needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + needsBroadcast = std::any_of(fShapeX.begin(), fShapeX.end(), + [&](const Dim& d) { + size_t i = &d - fShapeX.data(); + return fShapeX[i].dim != fShapeY[i].dim; + }); + } + if (!needsBroadcast) return ""; // same static shape — just a memcpy + + const std::size_t D = fShapeY.size(); + + // Left-pad fShapeX with dim=1 entries to match rank of fShapeY + std::vector shapeX_padded(D, 1); + size_t offset = D - fShapeX.size(); + for (size_t i = 0; i < fShapeX.size(); ++i) + shapeX_padded[offset + i] = fShapeX[i].dim; + + std::vector shapeY_int(D); + for (size_t i = 0; i < D; ++i) + shapeY_int[i] = fShapeY[i].dim; + auto stridesX = UTILITY::ComputeStrideFromShape(shapeX_padded); + auto stridesY = UTILITY::ComputeStrideFromShape(shapeY_int); + std::size_t totalElements = ConvertShapeToLength(shapeY_int); + + std::string kname = "ExpandKernel_" + opName; + + std::string op; + op = "\n//------ EXPAND_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += 
grid_thread_extent) {\n\n"; + + // Decompose output linear index using compile-time output strides + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(shapeY_int[d]) + "u;\n"; + } + op += "\n"; + + // Input index: broadcast dims (shapeX_padded[d]==1) contribute 0 — + // compiler eliminates zero terms entirely, no runtime branch + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeX_padded[d] == 1) { + op += SP + SP + SP + SP + SP + "0u"; + } else { + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesX[d]) + "u"; + } + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; // end grid-stride loop + op += SP + SP + "}\n"; // end operator() + op += SP + "};\n"; // end struct + + return op; +} + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + if (fInitialized) return ""; + + auto isStatic = [](const std::vector& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + if (!isStatic(fShapeX) || !isStatic(fShapeY)) return ""; + + // Check if broadcast is actually needed + bool needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + for (size_t i = 0; i < fShapeX.size(); ++i) + if (fShapeX[i].dim != fShapeY[i].dim) { needsBroadcast = true; break; } + } + if (!needsBroadcast) return ""; + + opName = "op_" + opName; + std::string kname = "ExpandKernel_" + opName; + return SP + kname + " expandKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw 
std::runtime_error("TMVA SOFIE Operator Expand called to Generate without being initialized first"); + + std::stringstream out; + out << "\n//------ EXPAND_GPU_ALPAKA\n"; + + if (fInitialized && !fInitBroadcast) { + // GenerateInitCode already handled the copy — nothing to do at inference time + return ""; + } + + auto isStatic = [](const std::vector& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + bool staticShapes = isStatic(fShapeX) && isStatic(fShapeY); + + // Check if broadcast is actually needed for static shapes + bool needsBroadcast = !staticShapes; // dynamic always needs runtime broadcast + if (staticShapes) { + needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + for (size_t i = 0; i < fShapeX.size(); ++i) + if (fShapeX[i].dim != fShapeY[i].dim) { needsBroadcast = true; break; } + } + } + + if (!needsBroadcast) { + // Same static shape — device-to-device copy + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY + << ", deviceBuf_" << fNX << ");\n"; + out << SP << "alpaka::wait(queue);\n"; + return out.str(); + } + + if (!staticShapes) { + // Dynamic shapes — not yet supported on GPU, throw a clear error + throw std::runtime_error( + "TMVA SOFIE Expand GPU: dynamic shapes are not yet supported for GPU inference. 
" + "Tensor " + fNX + " has a dynamic shape."); + } + + // Static broadcast — launch the expand kernel + std::vector shapeY_int(fShapeY.size()); + for (size_t i = 0; i < fShapeY.size(); ++i) + shapeY_int[i] = fShapeY[i].dim; + std::size_t totalElements = ConvertShapeToLength(shapeY_int); + std::string kname = "expandKernel_" + opName; + + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "alpaka::KernelCfg const kernelCfg_" << opName + << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName + << ", devAcc, " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + + return out.str(); +} +}; }//SOFIE #endif //SOFIE_ROperator_Expand diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx index bb1a74e..5b553ff 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx @@ -11,7 +11,6 @@ #include #include - namespace SOFIE { /*! 
\brief Gated Recurrent Unit operator @@ -91,7 +90,7 @@ template class ROperator_GRU final : public ROperator { fNSequence_lens(UTILITY::Clean_name(nameSequence_lens)), fNInitial_h(UTILITY::Clean_name(nameInitial_h)), fNY(UTILITY::Clean_name(nameY)), fNY_h(UTILITY::Clean_name(nameY_h)) { - + fInputTensorNames = { fNX, fNW, fNR }; if (!fNB.empty()){ fInputTensorNames.emplace_back(fNB); @@ -123,39 +122,34 @@ template class ROperator_GRU final : public ROperator { * * \param input type of the input tensors */ - std::vector TypeInference(std::vector /*input*/); + std::vector TypeInference(std::vector /*input*/) override; /*! \brief Infers the shape of the output tensors * * \param input shape of the input tensors */ - std::vector> ShapeInference(std::vector> /*input*/); + std::vector> ShapeInference(std::vector> /*input*/) override; /*! \brief Initialize the model * * \param model Model */ - void Initialize(RModel &); + void Initialize(RModel &) override; /*! \brief Generate the inference code * * \param OpName name of the operator */ - std::string Generate(std::string /*OpName*/); - - /*! \brief Generate the code for the Session internal data vectors - * - * \param opName name of the operator - */ - std::string GenerateSessionMembersCode(std::string opName); + std::string Generate(std::string /*OpName*/) override; /*! 
\brief Returns the blas routines needed to compile the generated code */ - std::vector GetBlasRoutines() { return { std::string("Gemm"), std::string("Axpy") }; } + std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; } }; } // namespace SOFIE + // Implementation of the ROperator_GRU class #include "SOFIE/ROperator_GRU.icc" diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc index f3813c2..38030d1 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc @@ -175,51 +175,45 @@ void ROperator_GRU::Initialize(RModel& model){ fAttrActivations = {"Sigmoid", "Tanh"}; } } -} -// generate code for Session data members (e.g. internal vectors) -template -std::string ROperator_GRU::GenerateSessionMembersCode(std::string opName) -{ - opName = "op_" + opName; - std::stringstream out; + // To get unique intermediate tensor names, we add the name of the input + // tensor. One might also consider using the index of the operator in the + // RMode, but this information is not available in the current scope. + std::string opName = "op_gru_" + fNX; size_t num_directions = fShapeW[0]; size_t seq_length = (fAttrLayout == 0) ? fShapeX[0] : fShapeX[1]; size_t batch_size = (fAttrLayout == 0) ? 
fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + auto declareVector = [&](std::string const &name, std::size_t n){ + std::string fullName = opName + "_" + name; + model.AddIntermediateTensor(fullName, ConvertStringToType(fType), std::vector{n}); + }; + if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_cell_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + declareVector("input", seq_length * batch_size * input_size); + declareVector("initial_hidden_state", num_directions * batch_size * fAttrHiddenSize); + declareVector("initial_cell_state", num_directions * batch_size * fAttrHiddenSize); } // Set the feedforward size_t ff_size = seq_length * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_update_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_reset_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_hidden_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; + declareVector("f_update_gate", ff_size); + declareVector("f_reset_gate", ff_size); + declareVector("f_hidden_gate", ff_size); // gate results size_t hs_size = seq_length * num_directions * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_update_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_reset_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << 
"std::vector<" << fType << "> fVec_" << opName << "_hidden_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("update_gate", hs_size); + declareVector("reset_gate", hs_size); + declareVector("hidden_gate", hs_size); // feedback - out << "std::vector<" << fType << "> fVec_" << opName << "_feedback = std::vector<" << fType << ">(" - << batch_size * fAttrHiddenSize << ");\n"; + declareVector("feedback", batch_size * fAttrHiddenSize); // hiddden state if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("hidden_state", hs_size); } - - out << "\n"; - - return out.str(); } @@ -234,12 +228,14 @@ auto ROperator_GRU::Generate(std::string OpName) size_t input_size = fShapeX[2]; size_t num_directions = fShapeW[0]; + auto getVec = [&](std::string const &name) { return "tensor_op_gru_" + fNX + "_" + name; }; + // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const* " << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = " << getVec("input") << ";\n"; } else { out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "];\n"; } @@ -261,8 +257,7 @@ auto ROperator_GRU::Generate(std::string OpName) << fNInitial_h << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_initial_hidden_state = fVec_" << OpName - << "_initial_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_initial_hidden_state = " << getVec("initial_hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "];\n"; @@ -283,9 +278,9 @@ 
auto ROperator_GRU::Generate(std::string OpName) // Set the feedforward size_t feedforward_size = seq_length * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_f_update_gate = fVec_" << OpName << "_f_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_reset_gate = fVec_" << OpName << "_f_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_hidden_gate = fVec_" << OpName << "_f_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_f_update_gate = " << getVec("f_update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_reset_gate = " << getVec("f_reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_hidden_gate = " << getVec("f_hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_f_update_gate[" << feedforward_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_f_reset_gate[" << feedforward_size << "] = {0};\n"; @@ -294,9 +289,9 @@ auto ROperator_GRU::Generate(std::string OpName) // Set the gates size_t hidden_state_size = seq_length * num_directions * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_update_gate = fVec_" << OpName << "_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_reset_gate = fVec_" << OpName << "_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_hidden_gate = fVec_" << OpName << "_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_update_gate = " << getVec("update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_reset_gate = " << getVec("reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_hidden_gate = " << getVec("hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_update_gate[" << hidden_state_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_reset_gate[" << hidden_state_size << "] = {0};\n"; @@ -307,14 +302,14 @@ auto 
ROperator_GRU::Generate(std::string OpName) out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_hidden_state = fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = " << getVec("hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_hidden_state[" << hidden_state_size << "] = {0};\n"; } } if (fUseSession) { - out << SP << fType << " * " << OpName << "_feedback = fVec_" << OpName << "_feedback.data();\n"; + out << SP << fType << " * " << OpName << "_feedback = " << getVec("feedback") << ";\n"; } else { out << SP << fType << " " << OpName << "_feedback[" << batch_size * fAttrHiddenSize << "] = {0};\n"; } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx index 4d34846..a56b012 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx @@ -22,9 +22,9 @@ private: std::string fNIndices; std::string fNY; - std::vector fShapeX; - std::vector fShapeIndices; - std::vector fShapeY; + std::vector fShapeX; + std::vector fShapeIndices; + std::vector fShapeY; std::vector fIndices; // indices vector in case they are known at initialization @@ -51,8 +51,12 @@ public: if (!model.CheckIfTensorAlreadyExist(fNX)) { throw std::runtime_error("TMVA SOFIE Gather Op Input Tensor " + fNX + " is not found in model"); } - fShapeX = model.GetTensorShape(fNX); - fShapeIndices = model.GetTensorShape(fNIndices); + fShapeX = model.GetDimTensorShape(fNX); + if (model.Verbose()) + std::cout << "Gather - initial shape " << ConvertDimShapeToString(fShapeX) << " shape of indices " + << ConvertDimShapeToString(model.GetDimTensorShape(fNIndices)) << std::endl; + // fShapeIndices can be dynamic + fShapeIndices = model.GetDimTensorShape(fNIndices); size_t q = fShapeIndices.size(); // Axis in range [0, r) where r=rank(X) size_t 
r = fShapeX.size(); @@ -60,18 +64,20 @@ public: if (fAttrAxis < 0) { fAttrAxis = fAttrAxis + int64_t(r); } - // empty fShapeIndices is a scalar value for the indices - size_t indicesLength = ConvertShapeToLength(fShapeIndices); + // case indices tensor is initialized if (model.IsInitializedTensor(fNIndices)) { + // empty shape Indices is a scalar value for the indices + size_t indicesLength = ConvertShapeToLength(model.GetTensorShape(fNIndices)); int64_t* indicesData = static_cast(model.GetInitializedTensorData(fNIndices).get()); - //flag index tensor as not writable (not sure this is needed since index tensor might be used in generated code) - model.SetNotWritableInitializedTensor(fNIndices); // update indices data in case of negative dim values for (size_t i = 0; i < indicesLength; i++) { - if (indicesData[i] < 0) { - indicesData[i] += fShapeX[fAttrAxis]; + // move this at generation time? + if (!fShapeX[fAttrAxis].isParam) { + if (indicesData[i] < 0) { + indicesData[i] += fShapeX[fAttrAxis].dim; + } } } // Save in a vector gather Indices of size q @@ -79,65 +85,91 @@ public: } // Output shape if (model.Verbose()) - std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertShapeToString(fShapeIndices) << std::endl; + std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertDimShapeToString(fShapeIndices) << std::endl; if (fShapeY.empty()) { fShapeY.resize(q + r - 1); if (fAttrAxis > 0) { - // Copy shape of X[0, ..., axis) to Shape of Y[0, ..., axis) + // Copy shape of X[0, ..., axis-1) to Shape of Y[0, ..., axis-1) std::copy(fShapeX.begin(), fShapeX.begin() + fAttrAxis, fShapeY.begin()); } // Set shape of Y[axis, ..., axis + q) for (size_t i = 0; i < q; i++) { - fShapeY[fAttrAxis + i] = fShapeIndices[i]; + fShapeY[fAttrAxis + i] = Dim{ fShapeIndices[i]}; } - // Copy shape of X[axis + 1, ..., axis + r) to shape of Y[axis + q, ... q + r - 1) + // Copy shape of X[axis + 1, ..., r) to shape of Y[axis + q, ... 
q + r - 1) std::copy(fShapeX.begin() + fAttrAxis + 1, fShapeX.end(), fShapeY.begin() + fAttrAxis + q); } // case input is known (type is an integer) and input indices is a scalar (or vector of size 1) if (model.IsInitializedTensor(fNX) && q <= 1 && r == 1 && fIndices.size() > 0) { + auto shapeX = ConvertShapeToInt(fShapeX); // we assume model is not dynamic + auto shapeY = ConvertShapeToInt(fShapeY); if (model.GetTensorType(fNX) == ETensorType::INT64) { auto inputData = static_cast(model.GetInitializedTensorData(fNX).get()); // if q <=1 and r = 1 output length = 1 (it is a scalar) - std::vector outputData(ConvertShapeToLength(fShapeY)); + std::vector outputData(1); //ConvertShapeToLength(shapeY)); outputData[0] = inputData[fIndices[0]]; - model.AddConstantTensor(fNY, fShapeY, outputData.data()); + model.AddConstantTensor(fNY, shapeY, outputData.data()); if (model.Verbose()) - std::cout << "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) + std::cout << "Gather: " << fNX << " " << ConvertShapeToString(shapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(shapeY) << " and values " << ConvertValuesToString(outputData) << " (constant) " << std::endl; fIsOutputConstant = true; } } + // case input is a shape tensor (r is == 1 by definition) and indices are known + else if (model.IsShapeTensor(fNX) && q <=1 && fIndices.size() > 0) { + auto inputData = model.GetShapeTensorValues(fNX); + // if r == 1 and q<=1 then output length is 1 (is a scalar or tensor of size1) + std::vector outputData(1); + outputData[0] = inputData[fIndices[0]]; + if (outputData[0].isParam) { + fIsOutputConstant = true; + // shapeY can be scalar or vector of size1 + model.AddShapeTensor(fNY, outputData, fShapeY.size() == 0); + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) + << " and values " << 
ConvertDimShapeToString(outputData) << " (shape) " << std::endl; + } else { + int64_t value = static_cast(outputData[0].dim); + auto shapeY = ConvertShapeToInt(fShapeY); + model.AddConstantTensor(fNY, shapeY, &value); + fIsOutputConstant = true; + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) + << " and values {" << value << "} (constant) " << std::endl; + } + } if (!fIsOutputConstant) { // Add output tensor model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); fType = ConvertTypeToString(model.GetTensorType(fNX)); if (model.Verbose()) - std::cout << "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << std::endl; + std::cout << "Gather: input " << fNX << " " << ConvertDimShapeToString(fShapeX) << " indices " << fNIndices << ConvertDimShapeToString(fShapeIndices) + << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) << std::endl; } } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out << "//--------- Gather " << opName << " --> " << fNY << " " << ConvertDimShapeToString(fShapeY) << "\n"; if (fIsOutputConstant) { // no code to generate here for constant output. 
Tensor output is defined in Session constructor - return "//---------------------------------------\n"; + out << "//--------------------(constant)----------\n"; + return out.str(); } - OpName = "op_" + OpName; - std::stringstream out; - out << "//--------- Gather operator \n"; // The shape of the output is q + r - 1 size_t r = fShapeX.size(); // Indices of shape q size_t q = fShapeIndices.size(); // Strides - std::vector stridesX = UTILITY::ComputeStrideFromShape(fShapeX); - std::vector stridesY = UTILITY::ComputeStrideFromShape(fShapeY); - std::vector stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesX = UTILITY::ComputeStrideFromShape(fShapeX); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); // case fIndices is not known we need to correct for negative axis indices at run-time if (fIndices.empty()) { - size_t indicesLength = ConvertShapeToLength(fShapeIndices); + auto indicesLength = ConvertDimShapeToLength(fShapeIndices); out << SP << "// correct in case of negative gather indices\n"; out << SP << "for (size_t i = 0; i < " << indicesLength << "; i++){\n"; out << SP << SP << "if (tensor_" << fNIndices << "[i] < 0)\n"; @@ -145,73 +177,230 @@ public: out << SP << "}\n"; } - // Fill the output Y[j_0, j_1, ..., j_{axis - 1}, i_0, i_1, ..., i_{q - 1}, j_{axis + 1}, ..., j_{r - 1}] // [0 ... axis) [axis ... axis + q) [axis + q ... q + r - 1) // iterate in [0 ... axis) [0 ... q) [axis ... 
r - 1) // for j_0, j_1, ..., j_{axis-1} + for (size_t j = 0; j < size_t(fAttrAxis); j++) { std::string index = "j_" + std::to_string(j); - out << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n"; + for (size_t k = 0; k <= j; k++) out << SP; + out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n"; } // for i_0, i_1, ..., i_{q - 1} - if (q == 0) - out << SP << SP << "{\n"; // add a scope for local variables for (size_t i = 0; i < q; i++) { std::string index = "i_" + std::to_string(i); - out << SP << SP << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n"; + for (size_t k = 0; k <= i + fAttrAxis; k++) out << SP; + out << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n"; } // for j_axis, j_{axis + 1}, ..., j_{r - 1} for (size_t j = fAttrAxis; j + 1 < r; j++) { - std::string index = "j_" + std::to_string(j); - out << SP << SP << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n"; + std::string index = "j_" + std::to_string(q+j); // annotate index using output axis + for (size_t k = 0; k <= q + j; k++) out << SP; + out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n"; } - out << SP << SP << SP << "size_t y_index = 0;\n"; + // add a scope for local variables in case above loop are not done + if (fAttrAxis == 0 && q == 0 && r <= 1) + out << SP << "{ // scalar case \n"; + + // output index + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t y_index = "; for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[j] << ";\n"; + if (j > 0) out << " + "; + out << "j_" << j; + if (stridesY[j].dim != 1) out << " * " << stridesY[j]; } for (size_t i = 0; i 
< q; i++) { - out << SP << SP << SP << "y_index += i_" + std::to_string(i) + " * " << stridesY[fAttrAxis + i] << ";\n"; + if (fAttrAxis + i > 0) out << " + "; + out << "i_" << i; + if (stridesY[fAttrAxis + i].dim != 1) out << " * " << stridesY[fAttrAxis + i]; } for (size_t j = fAttrAxis; j + 1 < r; j++) { - out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[q + j] << ";\n"; + if (j + q > 0) out << " + "; + out << "j_" << q+j; + if (stridesY[q+j].dim != 1) out << " * " << stridesY[q+j]; } - // Indices - out << SP << SP << SP << "size_t i_index = 0;\n"; + // empty case + if (fAttrAxis == 0 && q == 0 && r <= 1) + out << "0"; + out << ";\n"; + + // input Indices + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t i_index = "; for (size_t i = 0; i < q; i++) { - out << SP << SP << SP << "i_index += i_" + std::to_string(i) + " * " << stridesIndices[i] << ";\n"; + if (i > 0) out << " + "; + out << "i_" << i; + if (stridesIndices[i].dim != 1) out << " * " << stridesIndices[i]; } + // empty case + if (q == 0) + out << "0"; + out << ";\n"; + // K - out << SP << SP << SP << "size_t k = static_cast(" << "tensor_" << fNIndices << "[i_index]" << ");\n"; + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t k = static_cast(" << "tensor_" << fNIndices << "[i_index]" << ");\n"; // Input - out << SP << SP << SP << "size_t x_index = k * " << stridesX[fAttrAxis] << ";\n"; + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t x_index = k"; + if (stridesX[fAttrAxis].dim != 1) out << " * " << stridesX[fAttrAxis]; for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << SP << SP << "x_index += j_" + std::to_string(j) + " * " << stridesX[j] << ";\n"; + out << " + "; + out << " j_" << j; + if (stridesX[j].dim != 1) out << " * " << stridesX[j]; } - for (size_t j = fAttrAxis + 1; j < r; j++) { - out << SP << SP << SP << "x_index += j_" + std::to_string(j - 1) + " * " << stridesX[j] << ";\n"; + // for input corresponding 
stride is axis+1,.... r + // loop is on j from fAttrAxis, so consider stridesX[j+1] + for (size_t j = fAttrAxis; j+1 < r; j++) { + out << " + "; + out << " j_" << q+j; + if (stridesX[j+1].dim != 1) out << " * " << stridesX[j+1]; } - out << SP << SP << SP << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n"; + out << ";\n"; + for (size_t k = 0; k < q + r; k++) out << SP; + out << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n"; // end loops j_k, j_{k + 1}, ..., j_{r - 2} - for (size_t j = fAttrAxis; j + 1 < r; j++) { - out << SP << SP << SP << "}\n"; - } - // end loops i_0, i_1, ..., i_{q - 1} - if (q == 0) - out << SP << SP << "}\n"; // end of scope for q = 0 - for (size_t i = 0; i < q; i++) { - out << SP << SP << "}\n"; - } - // end loops j_0, j_1, ..., j_{axis - 1} - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << "}\n"; + for (size_t j = q+r-1; j > 0; j--) { + for (size_t k = 0; k \n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "int64_t const* __restrict__ indices,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + stridesY[d].GetVal() + "u) % " + + fShapeY[d].GetVal() + "u;\n"; + } + op += "\n"; + + // Output dims [axis ... axis+q) correspond to the indices tensor dims [0 ... 
q) + // so i_index = sum over i in [0,q): out_{axis+i} * stridesIndices[i] + if (q == 0) { + op += SP + SP + SP + SP + "std::size_t const i_index = 0u;\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const i_index =\n"; + for (std::size_t i = 0; i < q; ++i) { + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(fAttrAxis + i) + + " * " + stridesIndices[i].GetVal() + "u"; + op += (i + 1 < q) ? " +\n" : ";\n"; + } + } + op += "\n"; + + op += SP + SP + SP + SP + "int64_t k = indices[i_index];\n"; + op += SP + SP + SP + SP + "if (k < 0) k += " + fShapeX[fAttrAxis].GetVal() + ";\n"; + op += SP + SP + SP + SP + "if (k < 0) k = 0;\n"; + op += SP + SP + SP + SP + "if (k >= static_cast(" + fShapeX[fAttrAxis].GetVal() + ")) " + + "k = static_cast(" + fShapeX[fAttrAxis].GetVal() + ") - 1;\n\n"; + + // x_index = k * stridesX[axis] + // + sum over j in [0, axis): out_j * stridesX[j] + // + sum over j in [axis+1, r): out_{j-1+q} * stridesX[j] + // (the dims after axis in Y are shifted by q-1 relative to X) + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + op += SP + SP + SP + SP + SP + "static_cast(k) * " + stridesX[fAttrAxis].GetVal() + "u"; + for (std::size_t j = 0; j < static_cast(fAttrAxis); ++j) { + op += " +\n" + SP + SP + SP + SP + SP + + "out_" + std::to_string(j) + " * " + stridesX[j].GetVal() + "u"; + } + for (std::size_t j = fAttrAxis + 1; j < r; ++j) { + // in Y, the coord for X's dim j lives at output dim q + j - 1 + op += " +\n" + SP + SP + SP + SP + SP + + "out_" + std::to_string(q + j - 1) + " * " + stridesX[j].GetVal() + "u"; + } + op += ";\n\n"; + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; +} + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "GatherKernel_" + opName; + return SP + kname + " 
gatherKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE Gather Op called to Generate without being initialized first"); + + auto totalElements = ConvertDimShapeToLength(fShapeY); + std::string kname = "gatherKernel_" + opName; + + std::stringstream out; + out << "\n//------ GATHER_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "alpaka::KernelCfg const kernelCfg_" << opName + << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName + << ", devAcc, " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + + return out.str(); +} + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GatherND.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_GatherND.hxx new file mode 100644 index 0000000..3fa45fa --- /dev/null +++ b/src/SOFIE_core/inc/SOFIE/ROperator_GatherND.hxx @@ -0,0 +1,304 @@ +#ifndef SOFIE_ROPERATOR_GATHERND +#define SOFIE_ROPERATOR_GATHERND + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include + 
+namespace SOFIE { + +class ROperator_GatherND final : public ROperator +{ +private: + + int64_t fBatchDims = 0; + + std::string fNData; + std::string fNIndices; + std::string fNY; + + std::vector fShapeData; + std::vector fShapeIndices; + std::vector fShapeY; + + std::string fType; + +public: + ROperator_GatherND() {} + ROperator_GatherND(int64_t batchDims, + std::string nameData, + std::string nameIndices, + std::string nameY) + : fBatchDims(batchDims), + fNData(UTILITY::Clean_name(nameData)), + fNIndices(UTILITY::Clean_name(nameIndices)), + fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = { fNData, fNIndices }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return { input[0] }; + } + + std::vector> ShapeInference(std::vector> input) override { + return { input[0] }; + } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNData)) + throw std::runtime_error("TMVA SOFIE GatherND: data tensor " + fNData + " not found in model"); + if (!model.CheckIfTensorAlreadyExist(fNIndices)) + throw std::runtime_error("TMVA SOFIE GatherND: indices tensor " + fNIndices + " not found in model"); + + fShapeData = model.GetTensorShape(fNData); + fShapeIndices = model.GetTensorShape(fNIndices); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + if (r < 1) + throw std::runtime_error("TMVA SOFIE GatherND: data rank must be >= 1"); + if (q < 1) + throw std::runtime_error("TMVA SOFIE GatherND: indices rank must be >= 1"); + if (b >= std::min(q, r)) + throw std::runtime_error("TMVA SOFIE GatherND: batch_dims must be < min(q, r)"); + if (last_idx_dim > r - b) + throw std::runtime_error("TMVA SOFIE GatherND: indices_shape[-1] must be <= r - batch_dims"); + + for (size_t i = 0; i < b; ++i) { + if (fShapeData[i] != fShapeIndices[i]) + throw std::runtime_error("TMVA SOFIE GatherND: first 
batch_dims dimensions of data and indices must match"); + } + + // Output shape: batch_dims + indices[0..q-2] + data[b + last_idx_dim .. r-1] + // rank = b + (q - b - 1) + (r - b - last_idx_dim) + // = q + r - last_idx_dim - 1 - b + fShapeY.clear(); + for (size_t i = 0; i < b; ++i) + fShapeY.push_back(fShapeData[i]); + for (size_t i = b; i + 1 < q; ++i) + fShapeY.push_back(fShapeIndices[i]); + for (size_t i = b + last_idx_dim; i < r; ++i) + fShapeY.push_back(fShapeData[i]); + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNData), fShapeY); + fType = ConvertTypeToString(model.GetTensorType(fNData)); + + if (model.Verbose()) + std::cout << "GatherND: data " << ConvertShapeToString(fShapeData) + << " indices " << ConvertShapeToString(fShapeIndices) + << " batch_dims=" << fBatchDims + << " -> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE GatherND called to Generate without being initialized first"); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + auto stridesData = UTILITY::ComputeStrideFromShape(fShapeData); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + size_t totalOutput = ConvertShapeToLength(fShapeY); + + std::stringstream out; + out << SP << "//--------- GatherND operator " << opName << "\n"; + + out << SP << "for (size_t out_idx = 0; out_idx < " << totalOutput << "; out_idx++) {\n"; + + out << SP << SP << "size_t rem = out_idx;\n"; + size_t Dy = fShapeY.size(); + for (size_t d = 0; d < Dy; ++d) { + out << SP << SP << "size_t oy_" << d << " = rem / " << stridesY[d] << ";\n"; + out << SP << SP << "rem %= " << stridesY[d] << ";\n"; + } + + out << SP << SP << "size_t idx_base = 0;\n"; + for (size_t i 
= 0; i < b; ++i) + out << SP << SP << "idx_base += oy_" << i << " * " << stridesIndices[i] << ";\n"; + for (size_t i = b; i + 1 < q; ++i) + out << SP << SP << "idx_base += oy_" << i << " * " << stridesIndices[i] << ";\n"; + + out << SP << SP << "size_t data_idx = 0;\n"; + for (size_t i = 0; i < b; ++i) + out << SP << SP << "data_idx += oy_" << i << " * " << stridesData[i] << ";\n"; + + out << SP << SP << "for (size_t k = 0; k < " << last_idx_dim << "; k++) {\n"; + out << SP << SP << SP << "int64_t idx_val = tensor_" << fNIndices + << "[idx_base + k * " << stridesIndices[q - 1] << "];\n"; + out << SP << SP << SP << "if (idx_val < 0) idx_val += " << "static_cast(tensor_" + << fNData << "_shape[" << b << " + k]);\n"; + out << SP << SP << SP << "data_idx += static_cast(idx_val) * " << "data_stride_b_plus_k_" << opName << "[k];\n"; + out << SP << SP << "}\n"; + + // Accumulate trailing data dims from output coords + // Y dims [b + (q-b-1) .. ] correspond to data dims [b + last_idx_dim .. r-1] + size_t y_trailing_start = b + (q - b - 1); + for (size_t i = b + last_idx_dim; i < r; ++i) { + size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim)); + out << SP << SP << "data_idx += oy_" << oy_dim << " * " << stridesData[i] << ";\n"; + } + + out << SP << SP << "tensor_" << fNY << "[out_idx] = tensor_" << fNData << "[data_idx];\n"; + out << SP << "}\n"; + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE GatherND called to Generate without being initialized first"); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + auto stridesData = UTILITY::ComputeStrideFromShape(fShapeData); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + size_t Dy = 
fShapeY.size(); + size_t totalOutput = ConvertShapeToLength(fShapeY); + + std::string kname = "GatherNDKernel_" + opName; + + std::string op; + op = "\n//------ GATHERND_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ data,\n"; + op += SP + SP + SP + "int64_t const* __restrict__ indices,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (size_t d = 0; d < Dy; ++d) { + op += SP + SP + SP + SP + "std::size_t const oy_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const idx_base =\n"; + // batch dims: oy_0..oy_{b-1} * stridesIndices[0..b-1] + // outer idx dims: oy_b..oy_{b+(q-b-2)} * stridesIndices[b..q-2] + bool first = true; + for (size_t i = 0; i < q - 1; ++i) { + op += SP + SP + SP + SP + SP + + (first ? "" : "+ ") + + "oy_" + std::to_string(i) + " * " + std::to_string(stridesIndices[i]) + "u\n"; + first = false; + } + if (first) op += SP + SP + SP + SP + SP + "0u\n"; // q==1: scalar index tuple + op += SP + SP + SP + SP + SP + ";\n\n"; + + op += SP + SP + SP + SP + "std::size_t data_idx =\n"; + first = true; + for (size_t i = 0; i < b; ++i) { + op += SP + SP + SP + SP + SP + + (first ? 
"" : "+ ") + + "oy_" + std::to_string(i) + " * " + std::to_string(stridesData[i]) + "u\n"; + first = false; + } + if (first) op += SP + SP + SP + SP + SP + "0u\n"; + op += SP + SP + SP + SP + SP + ";\n\n"; + + op += SP + SP + SP + SP + "// Read " + std::to_string(last_idx_dim) + "-element index tuple\n"; + for (size_t k = 0; k < last_idx_dim; ++k) { + size_t idx_offset = k; + size_t data_axis = b + k; + op += SP + SP + SP + SP + "{\n"; + op += SP + SP + SP + SP + SP + + "int64_t idx_val = indices[idx_base + " + + std::to_string(idx_offset) + "u];\n"; + op += SP + SP + SP + SP + SP + + "if (idx_val < 0) idx_val += " + + std::to_string(fShapeData[data_axis]) + ";\n"; + op += SP + SP + SP + SP + SP + + "data_idx += static_cast(idx_val) * " + + std::to_string(stridesData[data_axis]) + "u;\n"; + op += SP + SP + SP + SP + "}\n"; + } + op += "\n"; + + size_t y_trailing_start = b + (q - b - 1); + for (size_t i = b + last_idx_dim; i < r; ++i) { + size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim)); + op += SP + SP + SP + SP + + "data_idx += oy_" + std::to_string(oy_dim) + + " * " + std::to_string(stridesData[i]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "output[elem_idx] = data[data_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "GatherNDKernel_" + opName; + return SP + kname + " gatherNDKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE GatherND called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "gatherNDKernel_" + opName; + + std::stringstream out; + out << "\n//------ GATHERND_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" 
<< opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "alpaka::KernelCfg const kernelCfg_" << opName + << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName + << ", devAcc, " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } +}; + +} // SOFIE + +#endif // SOFIE_ROPERATOR_GATHERND diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx index 046bf56..47efe01 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx @@ -23,6 +23,7 @@ namespace SOFIE{ private: bool fIsDynamic = false; + bool fBroadcastBias = false; float fAttrAlpha = 1.0; float fAttrBeta = 1.0; @@ -32,7 +33,6 @@ namespace SOFIE{ std::string fNA; std::string fNB; std::string fNC = ""; - std::string fNC2; // bias tensor name after broadcasting std::string fNY; std::string fType; EActivationType fActivation; @@ -40,6 +40,7 @@ namespace SOFIE{ std::vector fShapeB; std::vector fShapeC; std::vector fShapeY; + RModel * fModel = nullptr; public: @@ -48,6 +49,7 @@ namespace SOFIE{ fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), 
fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::GEMM; fActivation = activation; fType = "float"; static_assert(std::is_same_v, @@ -60,9 +62,11 @@ namespace SOFIE{ fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation) { + fKind = OperatorKind::GEMM; fActivation = activation; fType = "float"; + fInputTensorNames = {fNA, fNB, fNC}; fOutputTensorNames = { fNY }; } @@ -72,7 +76,7 @@ namespace SOFIE{ } template - std::vector> DoShapeInference(const std::vector> & input){ + std::vector DoShapeInference(const std::vector> & input){ if (input.size() > 3) throw std::runtime_error("TMVA SOFIE Gemm Op Shape Inference only need 2 or 3 input tensor"); // accept tensor with input dimensions > 2 // example: A = (d1,d2,...,N1,N2) B = (d1,d2,...,N2,N3) --> Y = (d1,d2,..,N1,N3) @@ -82,11 +86,10 @@ namespace SOFIE{ } } - std::vector> ret; // when there are 3 inputs shape of Y is the one of C if (input.size() == 3){ - ret.push_back(input[2]); //shape of C is shape of Y - return ret; + //shape of C is shape of Y + return input[2]; } // ioffset cannot be less than 2 int ioffset = input[0].size()-2; // in case of tensors with dim > 2 @@ -105,6 +108,7 @@ namespace SOFIE{ if (input[0].size() > 2 && input[1].size() == input[0].size()) { // in case of dim > 2 first dimensions are equal to the input ones not // equal to 1 (e.g. 
(1,2,3) * (2,3,4) -> (2,2,4)) + // here could probably use the Broadcasting function UTILITY::MultidirectionalBroadcastShape for (size_t i = 0; i < input[0].size()-2; i++) { Dim valueA = input[0][i]; Dim valueB = input[1][i]; @@ -113,24 +117,41 @@ namespace SOFIE{ s_y.push_back(input[0][i]); else if (valueA.GetVal() == "1") s_y.push_back(input[1][i]); + else if (!valueA.isParam && !valueB.isParam) + throw std::runtime_error("TMVA SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " + + valueB.GetVal()); + else if (valueA.isParam && valueB.isParam){ + // check which parameter is first in RModel list + auto & dimNames = fModel->GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), valueA.param); + auto p2 = std::find(dimNames.begin(), dimNames.end(), valueB.param); + if (p1 < p2) s_y.push_back(input[0][i]); + else s_y.push_back(input[1][i]); + } + else if (!valueA.isParam) + s_y.push_back(input[0][i]); + else if (!valueB.isParam) + s_y.push_back(input[1][i]); else throw std::runtime_error("TMVA SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " + valueB.GetVal()); } - s_y.push_back(input[0][i]); + else + s_y.push_back(input[0][i]); } } s_y.push_back(s_a[0]); s_y.push_back(s_b[1]); - ret.push_back(s_y); - return ret; + return s_y; } std::vector> ShapeInference(std::vector> input) override { - return DoShapeInference(input); + std::vector> ret; + ret.push_back(DoShapeInference(input)); + return ret; } - std::vector> DynamicShapeInference(const std::vector> & input){ + std::vector DynamicShapeInference(const std::vector> & input){ return DoShapeInference(input); } @@ -138,6 +159,7 @@ namespace SOFIE{ void Initialize(RModel& model) override { //TODO: propagate A or B as specified by ONNX standard + fModel = &model; if ((model.CheckIfTensorAlreadyExist(fNA) == false) || (model.CheckIfTensorAlreadyExist(fNB) == false) ){ //input must be a graph input, or already initialized intermediate tensor throw 
std::runtime_error("TMVA SOFIE Gemm Op Input Tensor " + fNA + " or " + fNB + " is not found in model"); @@ -186,14 +208,8 @@ namespace SOFIE{ } } - fShapeY = DynamicShapeInference({fShapeA, fShapeB})[0]; - std::vector shapeY; - if (!fIsDynamic) { - shapeY = ConvertShapeToInt(fShapeY); - if (shapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertDynamicShapeToString(fShapeY)); - } - } + fShapeY = DynamicShapeInference({fShapeA, fShapeB}); + std::vector shapeY = ConvertShapeToInt(fShapeY); // bias is normally not dynamic (not support it for time being) if (fNC != ""){ @@ -202,38 +218,27 @@ namespace SOFIE{ throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is dynamic and is not supported"); } fShapeC = model.GetTensorShape(fNC); - fNC2 = fNC; - size_t lengthC = ConvertShapeToLength(fShapeC); - size_t lengthY = ConvertShapeToLength(shapeY); - // for dynamic outputs broadcasting is always done - bool broadcast_needed = lengthC != lengthY; + // for dynamic outputs broadcasting is always needed + bool broadcast_needed = false; + if (fIsDynamic && shapeY.empty()) + broadcast_needed = true; + else + // consider broadcasting also if same length + broadcast_needed = (fShapeC != shapeY); if (broadcast_needed) { - if (!model.UseSession()) { - // without session dynamic tensors not supported in Gemm - if (fIsDynamic) { - throw std::runtime_error("TMVA SOFIE Gemm Op: dynamic tensors not supported without a session"); - } - auto original_data = model.GetInitializedTensorData(fNC); - auto targetShape = UTILITY::UnidirectionalBroadcastShape(fShapeC, shapeY); - if (fType == "float") { - std::shared_ptr new_data_ptr(UTILITY::UnidirectionalBroadcast( - static_cast(original_data.get()), fShapeC, targetShape), - std::default_delete()); - - model.UpdateInitializedTensor(fNC, model.GetTensorType(fNC), shapeY, new_data_ptr); - fShapeC = shapeY; - } - } else { - // In case of session add broadcasting code in Session 
constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNC2 = fNC + "bcast"; - if (!fIsDynamic) { - model.AddIntermediateTensor(fNC2, model.GetTensorType(fNC), shapeY); - } - else - model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY); + fBroadcastBias = true; + // check if broadcasting is compatible and note that prepend 1 to shapeC + auto shapeDimC = ConvertShapeToDim(fShapeC); + auto r = UTILITY::MultidirectionalBroadcastShape(fShapeY, shapeDimC); + // return flag must be equal to 1 since this is a unidirectional broadcast of C->Y + if (r.first > 1) { + throw std::runtime_error("TMVA SOFIE Gemm Op - bias tensor of shape " + ConvertShapeToString(fShapeC) + " cannot be uni-directional broadcasted to " + ConvertDimShapeToString(fShapeY)); + } + fShapeC = ConvertShapeToInt(shapeDimC); + if (fShapeC.empty()) { + throw std::runtime_error("TMVA SOFIE Gemm Op - Error in bias tensor " + ConvertDimShapeToString(shapeDimC) ); } } } @@ -260,7 +265,7 @@ namespace SOFIE{ if (model.Verbose()){ std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape "; if (fIsDynamic) - std::cout << ConvertDynamicShapeToString(fShapeY) << std::endl; + std::cout << ConvertDimShapeToString(fShapeY) << std::endl; else std::cout << ConvertShapeToString(shapeY) << std::endl; } @@ -268,35 +273,167 @@ namespace SOFIE{ model.AddNeededStdLib("algorithm"); } - std::string GenerateInitCode() override { + std::string Generate(std::string opName) override { + opName = "op_" + opName; + + if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { + throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); + } std::stringstream out; - // generate initialization code for broadcasting of bias tensor - if (fShapeC.size() != fShapeY.size() && fNC != fNC2) { - // we broadcast here always C in Y output, so target shape is the one of Y - // no need to call 
UTILITY::UnidirectionalBroadcastShape. - // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code. - auto targetShape = fShapeY; - // include a separate scope to avoid defining unique operator temp variables - out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; - out << SP << "{\n"; - out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertDynamicShapeToString(fShapeY) << ");\n"; - auto length = SOFIE::ConvertDynamicShapeToLength(fShapeY); // output size - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n"; - out << SP << SP << "delete [] data;\n"; + out << "\n//--------- Gemm " << opName << " " << ConvertDimShapeToString(fShapeA) << " * " << ConvertDimShapeToString(fShapeB) + << " -> " << ConvertDimShapeToString(fShapeY) << "\n"; + // need to consider case A and B have dim > 2 (for MatMul) + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + int64_t dimY = fShapeY.size(); + if (dimA != dimB || dimA != dimY) { + throw std::runtime_error("TMVA SOFIE Gemm(MatMul) has invalid shape for inputs or output"); + } + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + // size of A: if (transposeA) is m*k else k*m + // size of B n*k + std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; + // extra dimensions in case of stacked MatMul + std::vector sExtraY; + for (int64_t i = 0; i < dimY-2; i++) { + sExtraY.push_back(fShapeY[i]); + } + auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation + auto lengthExtra_Y = ConvertDimShapeToLength(sExtraY); // extra length in case input tensors are of dim>2 (MatMul) + + // case bias is present + if (!fNC.empty()){ + if (!fBroadcastBias) { + // add a check in case broadcasting was not needed or done outside of session + // C should have smaller dimension of Y + if (!fIsDynamic) { + if (std::stoi(lengthGemm) != static_cast(ConvertShapeToLength(fShapeC))) + throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor has not correct size " + + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); + } else { + // add a dynamic check (C should not be a dynamic tensor) + out << SP << "assert(" << lengthGemm << " == " << ConvertShapeToLength(fShapeC) << ");\n"; + } + } + } else { + //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use + // the previous result + if (fAttrBeta != 0) { + throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero"); + } + } + + // include MatMul case where we stack the Gemm operations + // exclude case where we have only 1's in the additional dims + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra_Y) > 1); + // compute input offset for stack multiplications + std::string lengthExtra_A; + std::string lengthExtra_B; + std::string increment_A; + std::string increment_B; + + if (doStackMul) { + std::vector sA(fShapeA.begin(), fShapeA.begin()+dimA-2); + std::vector sB(fShapeB.begin(), fShapeB.begin()+dimB-2); + std::vector mA = {fShapeA[dimA-2], 
fShapeA[dimA-1]}; + std::vector mB = {fShapeA[dimB-2], fShapeB[dimB-1]}; + lengthExtra_A = ConvertDimShapeToLength(sA); + lengthExtra_B = ConvertDimShapeToLength(sB); + // size of A performing matmul is m*k and n*k for B + increment_A = ConvertDimShapeToLength(mA); + increment_B = ConvertDimShapeToLength(mB); + } + bool extraA = (doStackMul && lengthExtra_A != "1"); + bool extraB = (doStackMul && lengthExtra_B != "1"); + if (doStackMul) { + out << SP << "size_t " << opName << "_y_offset = 0;\n"; // needed if we stack the gemm operations + if (extraA) + out << SP << "size_t " << opName << "_A_offset = 0;\n"; + if (extraB) + out << SP << "size_t " << opName << "_B_offset = 0;\n"; + out << SP << "for (size_t i = 0; i < " << lengthExtra_Y << "; i++){\n"; + out << SP; + } + // do the bias broadcasting + if (fBroadcastBias) { + fAttrBeta = 1.; + out << SP << "for (size_t j = 0; j < " << sY[0] << "; j++) { \n"; + out << SP << SP << "size_t y_index = "; + if (doStackMul) // add offset in caseof stack multiplications (not sure if bias is present in these cases) + out << opName << "_y_offset + "; + if (sY[1].GetVal() != "1") + out << sY[1] << " * j;\n"; + else + out << "j;\n"; + + out << SP << SP << "for (size_t k = 0; k < " << sY[1] << "; k++) { \n"; + std::string bias_index; + if (fShapeC[0] == 1 && fShapeC[1] == sY[1].dim) + bias_index = "k"; + else if (fShapeC[1] == 1 && fShapeC[0] == sY[0].dim) + bias_index = "j"; + else if (fShapeC[0] == 1 && fShapeC[1] == 1) // scalar case + bias_index = "0"; + else { + throw std::runtime_error("TMVA SOFIE Gemm Op - invalid shape for bias tensor " + ConvertShapeToString(fShapeC)); + } + + out << SP << SP << SP << "tensor_" << fNY << "[y_index + k] = " << "tensor_" << fNC << "[" << bias_index << "];\n"; + out << SP << SP << "}\n"; out << SP << "}\n"; } + + if (fType == "float"){ + + out << SP << "TMVA::Experimental::SOFIE::Gemm_Call(" + << "tensor_" << fNY; + if (doStackMul) out << " + " << opName << "_y_offset"; + out << ", " + << 
(fAttrTransB ? "true, " : "false, ") + << (fAttrTransA ? "true, " : "false, ") + << n << ", " << m << ", " << k << ", "; + out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", tensor_" << fNB; + if (extraB) out << " + " << opName << "_B_offset"; + out << ", tensor_" << fNA; + if (extraA) out << " + " << opName << "_A_offset"; + out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; + // in the case of bias and no broadcasting needed + if (!fNC.empty() && !fBroadcastBias) + out << "tensor_" << fNC; + else + out << "nullptr"; + out << ");\n"; + + if(fActivation == EActivationType::RELU){ + out << SP << "for (int id = 0; id < " << ConvertDimShapeToLength(fShapeY) << " ; id++){\n"; + out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNY << "[id] > 0 )? tensor_" << fNY << "[id] : 0);\n"; + out << SP << "}\n"; + } + } + + if (doStackMul) { + out << SP << SP << opName << "_y_offset += " << lengthGemm << ";\n"; + if (lengthExtra_A != "1") + out << SP << SP << opName << "_A_offset += " << increment_A << ";\n"; + if (lengthExtra_B != "1") + out << SP << SP << opName << "_B_offset += " << increment_B << ";\n"; + + out << "}\n"; // end of loop on the stacked multiplications + } + return out.str(); } - std::string Generate(std::string opName) override { + std::string Generate_GPU_ALPAKA(std::string opName) override { opName = "op_" + opName; if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); } std::stringstream out; - out << "\n//--------- Gemm\n"; + out << "\n//--------- Gemm_GPU_ALPAKA\n"; out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n"; out << SP << "char " << opName << "_transB = " << (fAttrTransB ? 
"\'t\'" : "\'n\'") << ";\n"; // need to consider case A and B have dim > 2 (for MatMul) @@ -315,20 +452,20 @@ namespace SOFIE{ for (int64_t i = 0; i < dimY-2; i++) { sA.push_back(fShapeY[i]); } - auto lengthGemm = ConvertDynamicShapeToLength(sY); // size of the Gemm operation - auto lengthExtra = ConvertDynamicShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) + auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation + auto lengthExtra = ConvertDimShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) out << SP << "int " << opName << "_m = " << m << ";\n"; out << SP << "int " << opName << "_n = " << n << ";\n"; out << SP << "int " << opName << "_k = " << k << ";\n"; out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ";\n"; - out << SP << "float " << opName << "_beta = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ";\n"; - out << SP << "int " << opName << "_lda = " << (fAttrTransA ? m : k) << ";\n"; - out << SP << "int " << opName << "_ldb = " << (fAttrTransB ? 
k : n) << ";\n"; + + // restricting to a 0 beta since BIAS is configured separately through sofieBLAS interface + out << SP << "float " << opName << "_beta = 0;\n"; // case bias is present if (!fNC.empty()){ - if (fNC2 == fNC) { + if (!fBroadcastBias) { // add a check in case broadcasting was not needed or done outside of session // C should have smaller dimension of Y if (!fIsDynamic) { @@ -358,39 +495,39 @@ namespace SOFIE{ } // in the case of bias if (!fNC.empty()){ - out << SP << "std::copy(" << "tensor_" << fNC2 << ", " << "tensor_" << fNC2 << " + " << lengthGemm << ", " - << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << "_yoffset"; - out << ");\n"; - } - - - if (fType == "float"){ - - out << SP << "BLAS::sgemm_(&" << opName << "_transB, &" << opName << "_transA, &" << opName - << "_n, &" << opName << "_m, &" << opName << "_k, &" << opName << "_alpha, " << "tensor_" << fNB - << ", &" << opName << "_ldb, " << "tensor_" << fNA << ", &" << opName << "_lda, &" << opName << "_beta, " - << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << "_yoffset"; - out << ", &" << opName << "_n);\n"; - - if(fActivation == EActivationType::RELU){ - out << SP << "for (int id = 0; id < " << SOFIE::ConvertDynamicShapeToLength(fShapeY) << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNY << "[id] > 0 )? tensor_" << fNY << "[id] : 0);\n"; - out << SP << "}\n"; + if (fActivation == EActivationType::RELU){ + out << SP << "blas.gemmrelu("< GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; } + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNY); + fNY = fusable_tensor_name; + fOutputTensorNames[0] = fNY; + } + std::string GetBlasConfig(){ + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + auto m = (fAttrTransA ? 
fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + auto lda = (fAttrTransA ? m : k); + auto ldb = (fAttrTransB ? k : n); + auto ldc = n; + return n+", "+m+", "+k+", "+ldb+", "+lda+", "+ldc; + } }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc b/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc index bec7760..ebf4daf 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_LSTM_I #define SOFIE_ROPERATOR_LSTM_I - namespace SOFIE { template @@ -291,7 +290,7 @@ auto ROperator_LSTM::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const *" << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx index 17b77b3..12ea5b7 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx @@ -3,16 +3,15 @@ #include "SOFIE/RModel.hxx" #include "SOFIE/SOFIE_common.hxx" - #include #include - namespace SOFIE { template class ROperator_LayerNormalization : public ROperator { private: + bool fCastToFloat = false; // flag to indicate if operation 1 are in floats (to be impl) int fAttrAxis; float fAttrEpsilon; size_t fAttrStashType; @@ -30,7 +29,7 @@ private: std::vector fShapeX; std::vector fShapeScale; - std::vector fShapeB; // shape of input Bias (B) is assumed to be fully defined + std::vector fShapeB; std::vector fShapeY; std::vector fShapeMean; std::vector fShapeInvStdDev; @@ -39,8 +38,8 @@ 
private: size_t fSize; // Size of the input // size_t fAxisDim; - std::vector fNormalizedShape; - std::vector fAxesShape; + std::vector fNormalizedShape; // shape from X[ axis,...,N-1] + std::vector fAxesShape; // shape from X[0,..,axis-1] // lengths in string format std::string fLength; // Length of the input std::string fNormalizedLength; @@ -78,10 +77,10 @@ public: void Initialize(RModel& model) override { if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Tensor " + fNX + " not found."); } bool isDynamic = model.IsDynamicTensor(fNX); - fShapeX = model.GetDynamicTensorShape(fNX); + fShapeX = model.GetDimTensorShape(fNX); fShapeY = fShapeX; model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); // Type of the output @@ -93,18 +92,17 @@ public: // Shape of fShapeX[0, ..., fAxis) fAxesShape = std::vector(fShapeX.begin(), fShapeX.begin() + fAxis); // Length of the axes - fAxesLength = ConvertDynamicShapeToLength(fAxesShape); + fAxesLength = ConvertDimShapeToLength(fAxesShape); // Shape of fShapeX[fAxis, ..., fSize) fNormalizedShape = std::vector(fShapeX.begin() + fAxis, fShapeX.end()); // Length of the normalized axis - fNormalizedLength = ConvertDynamicShapeToLength(fNormalizedShape); + fNormalizedLength = ConvertDimShapeToLength(fNormalizedShape); // length of the input - fLength = ConvertDynamicShapeToLength(fShapeX); + fLength = ConvertDimShapeToLength(fShapeX); // Type of mean and std ETensorType type = (fAttrStashType == 1) ? 
ETensorType::FLOAT : model.GetTensorType(fNX); // Mean - if (fNMean.empty()) { - fNMean = "Mean" + fNX; + if (!fNMean.empty()) { // cannot use initializer list with one element since it is ambiguous if (isDynamic) // add size_t(-1) to indicate that shape is an expression @@ -113,29 +111,60 @@ public: model.AddIntermediateTensor(fNMean, type, std::vector(1,std::stoi(fAxesLength))); } // Inverse Standard Deviation - if (fNInvStdDev.empty()) { - fNInvStdDev = "InvStdDev" + fNX; + if (!fNInvStdDev.empty()) { if (isDynamic) model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); else model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,std::stoi(fAxesLength))); } + // if mean and stdev are not empty they are not defined in the output list // Cast X to float if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) { - fNCastedX = "Casted" + fNX; - model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); - fNNormalizedX = "Normalized" + fNX; - model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + fCastToFloat = true; + fType = "float"; + // fNCastedX = "Casted" + fNX; + // model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); + // fNNormalizedX = "Normalized" + fNX; + // model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + } + // scale shape + fShapeScale = model.GetDimTensorShape(fNScale); + // appends 1 to scale shapes if missing + size_t dimScale = fShapeScale.size(); + if (dimScale < fSize) { + for (size_t i = 0; i < fSize-dimScale; i++) + fShapeScale.insert(fShapeScale.begin(), Dim{1}); + } + // check also shape if consistent now + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1 && fShapeScale[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Scale Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); } - // Broadcast the bias if (!fNB.empty()) { - fShapeB = 
model.GetTensorShape(fNB); - size_t lengthB = ConvertShapeToLength(fShapeB); - if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { - fNBroadcastedB = "Broadcasted" + fNB; - model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + fShapeB = model.GetDimTensorShape(fNB); + // appends 1 to bias shapes if missing + size_t dimB = fShapeB.size(); + if (dimB < fShapeX.size()) { + for (size_t i = 0; i < fSize-dimB; i++) + fShapeB.insert(fShapeB.begin(), Dim{1}); + } + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1 && fShapeB[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Bias Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); } } + + std::cout << "bias + scale " << ConvertDimShapeToString(fShapeB) << " " << ConvertDimShapeToString(fShapeScale) << std::endl; + + // // Broadcast the bias + // if (!fNB.empty()) { + // fShapeB = model.GetTensorShape(fNB); + // size_t lengthB = ConvertShapeToLength(fShapeB); + // if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { + // fNBroadcastedB = "Broadcasted" + fNB; + // model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + // } + // } model.AddNeededStdLib("cmath"); } @@ -145,8 +174,8 @@ public: if (!fNBroadcastedB.empty()) { out << SP << "// Broadcasting the bias of LayerNormalization op\n"; out << SP << "{\n"; - out << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; - out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertDynamicShapeToString(fShapeX) << ");\n"; + out << SP << SP << "float* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; + out << fNB << ", " << ConvertDimShapeToString(fShapeB) << ", " << ConvertDimShapeToString(fShapeX) << ");\n"; out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n"; out << SP << "delete[] data;\n"; out << SP << "}\n"; @@ -161,10 +190,6 @@ public: 
throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName + " called to generate without being initialized first."); } - if (fShapeX.size() > 5) { - throw std::runtime_error("TMVA::SOFIE LayerNormalization operator not " - "implemented for input tensor of size > 5."); - } std::stringstream out; @@ -178,10 +203,32 @@ public: } auto strides = UTILITY::ComputeStrideFromShape(fShapeX); - std::string InputIndex = "axis_0 * " + strides[0].GetVal(); + std::string inputIndex = "axis_0 * " + strides[0].GetVal(); for (size_t i = 1; i < fSize; i++) { - InputIndex += " + axis_" + std::to_string(i) + " * " + strides[i].GetVal(); + inputIndex += " + axis_" + std::to_string(i); + if (i < fSize-1) inputIndex += " * " + strides[i].GetVal(); + } + auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale); + std::string scaleIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1) { + if (!scaleIndex.empty()) scaleIndex += " + "; + scaleIndex += "axis_" + std::to_string(i); + if ( scaleStrides[i].dim != 1) scaleIndex += " * " + scaleStrides[i].GetVal(); + } + } + if (scaleIndex.empty()) scaleIndex = "0"; + + auto biasStrides = UTILITY::ComputeStrideFromShape(fShapeB); + std::string biasIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1) { + if (!biasIndex.empty()) biasIndex += " + "; + biasIndex += "axis_" + std::to_string(i); + if ( biasStrides[i].dim != 1) biasIndex += " * " + biasStrides[i].GetVal(); + } } + if (biasIndex.empty()) biasIndex = "0"; auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal(); @@ -189,51 +236,33 @@ public: axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal(); } - auto normalizedStrides = UTILITY::ComputeStrideFromShape(fNormalizedShape); - std::string normalizedIndex = "axis_" + std::to_string(fAxis) + " * " + normalizedStrides[0].GetVal(); - for (size_t i = fAxis + 1; i 
< fSize; i++) { - normalizedIndex += " + axis_" + std::to_string(i) + " * " + normalizedStrides[i - fAxis].GetVal(); - } - if (!fNCastedX.empty()) { - // Cast X to float - out << SP << "for (size_t i = 0; i < " << fLength << "; i++) {\n"; - out << SP << SP << "tensor_" << fNCastedX << "[i] = " << "static_cast(tensor_" << fNX; - out << "[i]);\n"; - out << SP << "}\n"; - } + // compute mean and std-dev. Save in tensors if requested out << SP << "// Compute the mean\n"; - // Loop over the normalized dimensions + + // Loop over all the outer dims in [0, fAxis) for (size_t i = 0; i < fAxis; i++) { std::string iIdx = "axis_" + std::to_string(i); out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] << "; " << iIdx << "++) {\n"; } - out << SP << SP << fType << " sum = 0.;\n"; - // loop over all the dims in [0, fAxis) + out << SP << SP << fType << " mean = 0.;\n"; + // loop over the normalized dimensions (fAxis,....,N-1) for (size_t j = fAxis; j < fSize; j++) { std::string jIdx = "axis_" + std::to_string(j); out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++) {\n"; } - out << SP << SP << SP << "sum += tensor_" << fNX << "[" << InputIndex << "];\n"; + out << SP << SP << SP << "mean += tensor_" << fNX << "[" << inputIndex << "];\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = sum / " << fType << "("; - out << fNormalizedLength << ");\n"; - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + out << SP << SP << "mean /= " << fType << "(" << fNormalizedLength << ");\n"; + out << SP << "// Compute the inverse Standard Deviation\n"; - // Loop over the normalized dimensions - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } + // 
Set sum = 0 out << SP << SP << fType << " sum = 0.;\n"; // loop over all the dims in [0, fAxis) @@ -242,91 +271,46 @@ public: out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++){\n"; } - out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << InputIndex << "] - tensor_" - << fNMean << "[" << axesIndex << "];\n"; + out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << inputIndex << "] - mean;\n"; out << SP << SP << SP << "sum += tmp*tmp;\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = 1 / std::sqrt("; + out << SP << SP << fType << " invStdDev = 1 / std::sqrt("; out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n"; - for (size_t i = 0; i < fAxis; i++) { - out << SP << "}\n"; - } - if (!fNCastedX.empty()) { - out << "// NormalizedX = InvStdDev * (CastedX - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNNormalizedX << "[" << InputIndex << "] = tensor_"; - out << fNInvStdDev << "[" << axesIndex << "] * (tensor_" << fNCastedX << "[" << InputIndex; - out << "] - tensor_" << fNMean << "[" << axesIndex << "])\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - out << "// Y = Scale o NormalizedX"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " 
< " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << axesIndex << "] * static_cast<" << fType << ">(tensor_" << fNCastedX << "[" << InputIndex; - out << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - } else { - out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << normalizedIndex << "] * tensor_" << fNInvStdDev << "[" << axesIndex; - out << "] * (tensor_" << fNX << "[" << InputIndex << "] - tensor_" << fNMean << "["; - out << axesIndex << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + + // set output mean and invStdDev if requested + if (!fNMean.empty()) + out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = mean;\n"; + if (!fNInvStdDev.empty()) + out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = invStdDev;\n"; + + // scale and add bias + + out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; + + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + 
std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx + << "++){\n"; } + out << SP << SP << SP << "tensor_" << fNY << "[" << inputIndex << "] = tensor_" << fNScale; + out << "[" << scaleIndex << "] * invStdDev * (tensor_" << fNX << "[" << inputIndex << "] - mean)"; - if (!fNB.empty()) { - std::string bias = "tensor_" + (fNBroadcastedB.empty() ? fNB : fNBroadcastedB); - out << SP << "// Add the bias to Y\n"; - out << SP << "int " << opName << "_n = " << fLength << ";\n"; - out << SP << "float " << opName << "_alpha = 1.;\n"; - out << SP << "int " << opName << "_inc = 1;\n"; - out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; - out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; + // add bias if needed + if (!fNB.empty()) + // assume bias has index as scale + out << " + tensor_" << fNB << "[" << biasIndex << "]"; + out << ";\n"; + + // close loops on normalizing dim [..,fAxis,...fSize-1] + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; + } + // close loops on the other dimensions [0,...,fAxis] + for (size_t i = 0; i < fAxis; i++) { + out << SP << "}\n"; } return out.str(); @@ -339,5 +323,4 @@ public: } // namespace SOFIE - #endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx index 8fefa6d..1218b56 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx @@ -27,7 +27,8 @@ public: ROperator_LeakyRelu(){} ROperator_LeakyRelu(float alpha,std::string nameX, std::string nameY): falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) - { + { + fKind = OperatorKind::LEAKYRELU; if(std::is_same::value){ fType = "float"; } @@ -75,6 +76,61 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = 
"\n//------ LEAKY_RELU_KERNEL_ALPAKA\n"; + op += "struct LeakyReluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements, T alpha) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < numElements) {\n"; + op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? data[idx] : alpha * data[idx];\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return "LeakyReluKernel leakyReluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator LeakyRelu called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ LEAKY_RELU_GPU_ALPAKA\n"; + out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNX << " = {elementsPerGrid_" << fNX << ", elementsPerThread_" << fNX << "};\n"; + out << SP << "auto const workDiv_" << fNX << " = alpaka::getValidWorkDiv(kernelCfg_" << fNX << ", devAcc, leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + + + std::string 
GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc b/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc index c03c1c2..c10c2a5 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_RNN_I #define SOFIE_ROPERATOR_RNN_I - namespace SOFIE { template @@ -230,7 +229,7 @@ auto ROperator_RNN::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { if (fType == "float") { - out << SP << "float *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << "float const*" << OpName << "_input = tensor_" << fNX << ";\n"; } } else { if (fUseSession) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx index 8af272d..fea9814 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx @@ -8,7 +8,6 @@ #include #include - namespace SOFIE{ template @@ -89,9 +88,9 @@ public: model.AddDynamicTensor(fNOutput, type, fShape); } if (model.Verbose()) { - std::cout << "Range -> output is " << fNOutput << " "; - if (fIsOutputConstant) std::cout << ConvertDynamicShapeToString(fShape) << std::endl; - else std::cout << ConvertDynamicShapeToString(model.GetDynamicTensorShape(fNOutput)) << std::endl; + std::cout << "Range -> output is " << fNOutput << " : " << ConvertDimShapeToString(fShape); + if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData(fNOutput)); + std::cout << std::endl; } } @@ -121,5 +120,5 @@ public: }; }//SOFIE - + #endif //SOFIE_ROPERATOR_RANGE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx 
b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx index 8062dca..fcc3cd6 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx @@ -24,6 +24,7 @@ public: ROperator_Relu(){} ROperator_Relu(std::string nameX, std::string nameY): fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::RELU; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; } @@ -42,11 +43,11 @@ public: throw std::runtime_error("TMVA SOFIE Relu Op Input Tensor " + fNX + " is not found in model"); } - fShape = model.GetDynamicTensorShape(fNX); + fShape = model.GetDimTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); if (model.Verbose()) { - std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDynamicShapeToString(fShape) << std::endl; + std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl; } } @@ -57,7 +58,7 @@ public: throw std::runtime_error("TMVA SOFIE Operator Relu called to Generate without being initialized first"); } std::stringstream out; - auto length = ConvertDynamicShapeToLength(fShape); + auto length = ConvertDimShapeToLength(fShape); out << "\n//------ RELU\n"; out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] > 0 )? 
tensor_" << fNX << "[id] : 0);\n"; @@ -65,6 +66,59 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { + std::string op; + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + op += "struct ReluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < numElements) {\n"; + op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? data[idx] : 0;\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "ReluKernel reluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Relu called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + out << "\n//------ RELU_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNY << " = {elementsPerGrid_" << fNY << ", elementsPerThread_" << fNY << "};\n"; + out << SP << "auto const workDiv_" << fNY << " = alpaka::getValidWorkDiv(kernelCfg_" << fNY << ", devAcc, reluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", reluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void 
UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx index 66a7e09..2b3391c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx @@ -6,7 +6,10 @@ #include "SOFIE/RModel.hxx" #include +#include #include +#include + namespace SOFIE{ @@ -19,17 +22,20 @@ class ROperator_Reshape final : public ROperator private: bool fVerbose = false; + bool fDimInput = false; + bool fDynamicShape = false; ReshapeOpMode fOpMode = Reshape; // type of Reshape operator int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape int fAxis = 1; // (for Flatten) std::string fNData; // input data tensor name - std::string fNShape; // reshape tensor name + std::string fNInput2; // reshape or axes tensor name depending on operator std::string fNOutput; // output tensor name - std::vector fShapeInput; // input shape data - std::vector fShapeOutput; // output shape data + std::vector fShapeInput; // input shape data + std::vector fShapeOutput; // output shape data std::vector fAttrAxes; // axes attributes (provided for all version of Squeeze/Unsqueeze) + std::vector fShape; // shape tensor values provided for Reshape public: @@ -42,16 +48,16 @@ public: } ROperator_Reshape(){} - ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameShape, std::string nameOutput) - : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNShape(UTILITY::Clean_name(nameShape)), - fNOutput(UTILITY::Clean_name(nameOutput)) + ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameInput2, std::string nameOutput) + : 
fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNInput2(UTILITY::Clean_name(nameInput2)), + fNOutput(UTILITY::Clean_name(nameOutput)) { if (opMode == Reshape) fAllowZero = attr_value; if (opMode == Flatten) fAxis = attr_value; fInputTensorNames = { fNData }; - if(!fNShape.empty()){ - fInputTensorNames.emplace_back(fNShape); + if(!fNInput2.empty()){ + fInputTensorNames.emplace_back(fNInput2); } fOutputTensorNames = { fNOutput }; } @@ -63,6 +69,8 @@ public: fAttrAxes(attrAxes) { assert(fOpMode == Squeeze || fOpMode == Unsqueeze); + fInputTensorNames = { fNData }; + fOutputTensorNames = { fNOutput }; } // output type is same as input @@ -70,94 +78,165 @@ public: auto ret = std::vector(1, input[0]); return ret; } + std::vector> ShapeInference(std::vector> input) override { + return input; + } // output shape - std::vector> ShapeInference(std::vector> input) override { - std::vector> ret; + std::vector> ShapeInference(const std::vector> & input) { + std::vector> ret; auto & input_shape = input[0]; - if (fOpMode == Reshape) { - if (input.size() != 2) throw std::runtime_error("TMVA SOFIE Reshape Op needs 2 input tensors"); - auto output_shape = input[1]; // the provided shape - size_t input_length = ConvertShapeToLength(input_shape); - size_t output_length = ConvertShapeToLength(output_shape); - // (input_length == output_length) is the easy case : (2,3,4) -> (2,12) - if (input_length != output_length) { - if ((output_length == 0 && fAllowZero == 0) || static_cast(output_length) < 0) { - // in this case value 0 or -1 in shape are automatically corrected - bool replacementDone = false; - for (size_t i = 0; i < output_shape.size(); i++) { - if (output_shape[i] == 0 || output_shape[i] == static_cast(-1)) { - if (replacementDone) { - throw std::runtime_error("TMVA Reshape Op : output shape has multiple negative or zero values"); + // correct the provided shape (here we have the value) for 0 or -1 + std::vector output_shape(fShape.size()); + assert(!fShape.empty() && 
!fDynamicShape); + for (size_t i = 0; i < output_shape.size(); i++) { + if (fShape[i] > 0 || (fAllowZero && fShape[i] >= 0)) + output_shape[i] = Dim{ static_cast(fShape[i]) }; + else if (!fAllowZero && fShape[i] == 0) + output_shape[i] = input_shape[i]; + } + // now case of -1 in shape + for (size_t i = 0; i < output_shape.size(); i++) { + if (fShape[i] == -1) { + auto tmp = output_shape; + tmp.erase(tmp.begin() + i); + auto tmp_length = ConvertDimShapeToLength(tmp); + auto input_length = ConvertDimShapeToLength(input_shape); + if (fVerbose) + std::cout << "reshape- try simplifying " << ConvertDimShapeToString(input_shape) << " with length " + << input_length << " to " << tmp_length << std::endl; + + if (IsInteger(tmp_length) && IsInteger(input_length)) + output_shape[i] = Dim{static_cast(std::stoi(input_length) / std::stoi(tmp_length))}; + else if (IsInteger(tmp_length) && std::stoi(tmp_length) == 1) { + output_shape[i] = Dim{input_length, static_cast(-1)}; + } + else { + //we can try simplifying expression if tmp_length is integer and part of input_length + // contains tmp_length + bool canSimplify = false; + std::vector reduced_input; + if (IsInteger(tmp_length)) { + + // try to tokenize with * the input length + + std::stringstream ss(input_length); + + std::string token; + + // Tokenizing w.r.t. 
space '*' + while(getline(ss, token, '*')) + { + // remove any whitespace + token.erase(std::remove_if(token.begin(), token.end(), + [](unsigned char x) { return std::isspace(x); }), token.end()); + if (token != tmp_length) { + if (IsInteger(token)) { + size_t il = static_cast(std::stoi(input_length)); + size_t tl = static_cast(std::stoi(tmp_length)); + if ((il % tl) == 0) { + canSimplify = true; + reduced_input.push_back(Dim{il / tl}); + } + } else { + reduced_input.push_back(Dim{token}); + } + } else { + // token is equal to tmp_length, can be not considered and is simplified + canSimplify = true; + } } - auto tmp = output_shape; - tmp.erase(tmp.begin() + i); - auto tmp_length = ConvertShapeToLength(tmp); - output_shape[i] = input_length / tmp_length; - replacementDone = true; } + if (canSimplify) { + // if length contains * we need to add some brackets + std::string res_shape = ConvertDimShapeToLength(reduced_input); + if (res_shape.find('*') != std::string::npos) + output_shape[i] = Dim{std::string("(") + res_shape + ")", static_cast(-1)}; + else + output_shape[i] = Dim{res_shape}; + } + if (!canSimplify) + output_shape[i] = Dim{std::string("(") + input_length + " / (" + tmp_length + "))", static_cast(-1)}; } - if (fVerbose) - std::cout << "Reshape: correct output shape from " << ConvertShapeToString(input[1]) - << " to " << ConvertShapeToString(output_shape) << std::endl; - } - if (ConvertShapeToLength(output_shape) != input_length) { - throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertShapeToString(input_shape) + - ConvertShapeToString(output_shape)); + + break; // cannot have more than -1 } + // throw std::runtime_error( + // "TMVA Reshape Op : output shape has multiple negative or zero values"); + } + + if (fVerbose) + std::cout << "Reshape: correct output shape to " << ConvertDimShapeToString(output_shape) << std::endl; + + if (!fDimInput && ConvertDimShapeToLength(output_shape) != ConvertDimShapeToLength(input_shape)) { + throw 
std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertDimShapeToString(input_shape) + + ConvertDimShapeToString(output_shape)); } ret.push_back(output_shape); } else if (fOpMode == Flatten) { - // flattenig case - size_t inputSize = ConvertShapeToLength(input_shape); - size_t b = input[0][0]; - std::vector newShape = {b, inputSize / b}; + // flatten case + if (fAxis < 0) + fAxis += input_shape.size(); + auto s1 = std::vector(input_shape.begin(), input_shape.begin() + fAxis); + auto s2 = std::vector(input_shape.begin() + fAxis, input_shape.end()); + auto l1 = ConvertDimShapeToLength(s1); + auto l2 = ConvertDimShapeToLength(s2); + std::vector newShape = {Dim{l1}, Dim{l2}}; ret.push_back(newShape); - } else if (fOpMode == Squeeze) { // squeeze // assume no axis is provided - remove all axes with value equal to 1 - auto output_shape = input[0]; - if (input.size() == 1) { + auto output_shape = input_shape; + if (fAttrAxes.empty()) { size_t i = 0; while (i < output_shape.size()) { - if (output_shape[i] == 1 ) { + if (output_shape[i] == Dim{1}) { output_shape.erase(output_shape.begin() + i); - } - else { + } else { i++; } } - } else if (input.size() == 2) { - auto & axes = input[1]; - for (size_t i = 0; i < axes.size(); i++){ - if (output_shape[axes[i]] != 1) - throw std::runtime_error("TMVA Squeeze Op : Invalid axes : " + ConvertShapeToString(axes) + - ConvertShapeToString(output_shape)); - output_shape.erase(output_shape.begin() + axes[i]); + } else { + std::cout << "getting shape for Squeeze...from attribute\n"; + auto axes = fAttrAxes; + for (size_t i = 0; i < axes.size(); i++) { + std::cout << i << " " << axes[i] << std::endl; + if (axes[i] < 0) + axes[i] += input_shape.size(); + if (!(output_shape[axes[i]] == Dim{1})) + throw std::runtime_error("TMVA Squeeze Op : Invalid axis value " + std::to_string(axes[i]) + + " for " + ConvertDimShapeToString(output_shape)); + } + // for calling vector::erase we must sort axes in decreasing order to avoid + 
std::sort(axes.begin(), axes.end(), std::greater()); + for (auto & axis : axes) { + std::cout << "erase give axis " << axis << " -> "; + for (auto & o : output_shape) std::cout << o << " , "; + std::cout << std::endl; + output_shape.erase(output_shape.begin() + axis); } } ret.push_back(output_shape); } - else if (fOpMode == Unsqueeze) { // unsqueeze - assert(input.size() == 2); - auto output_shape = input[0]; - auto &axes = input[1]; + std::cout << "doing unsqueeze....\n"; + assert(!fAttrAxes.empty()); + auto output_shape = input_shape; + auto &axes = fAttrAxes; // output rank int64_t r = input[0].size() + axes.size(); - for (auto & a : axes) { + for (auto &a : axes) { int64_t i = static_cast(a); - if ( i < -r || i > r - 1 ) + if (i < -r || i > r - 1) throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range"); if (i >= 0) - output_shape.insert(output_shape.begin() + i, 1); + output_shape.insert(output_shape.begin() + i, Dim{1}); else - //negative axes - output_shape.insert(output_shape.end() + i + 1, 1); + // negative axes + output_shape.insert(output_shape.end() + i + 1, Dim{1}); } ret.push_back(output_shape); } @@ -167,33 +246,55 @@ public: void Initialize(RModel& model) override { fVerbose = model.Verbose(); + if (fVerbose) + std::cout << "initialize reshape op type " << fOpMode << " - " << fNInput2 << " " << fNData << std::endl; + if (model.CheckIfTensorAlreadyExist(fNData) == false) { // input must be a graph input, or already initialized intermediate tensor throw std::runtime_error("TMVA Reshape Op Input Tensor " + fNData + " is not found in model"); } - fShapeInput = model.GetTensorShape(fNData); - // check if optional shape tensor exist - if (!fNShape.empty()) { - if (model.CheckIfTensorAlreadyExist(fNShape)) { - auto dptr = model.GetInitializedTensorData(fNShape); - auto input_shape = static_cast(dptr.get()); - auto vec = model.GetTensorShape(fNShape); - assert(vec.size() == 1); - size_t n = vec[0]; // size of shape input tensor - - 
std::vector descShape(n); - std::copy(input_shape, input_shape + n, descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; - // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed - model.SetNotWritableInitializedTensor(fNShape); + fShapeInput = model.GetDimTensorShape(fNData); + fDimInput = model.IsDynamicTensor(fNData); + // check if optional tensor exists defining shape or axes + if (!fNInput2.empty()) { + if (model.CheckIfTensorAlreadyExist(fNInput2)) { + if (model.IsInitializedTensor(fNInput2)) { + // assume input shape is an initialized tensor + auto dptr = model.GetInitializedTensorData(fNInput2); + auto values = static_cast(dptr.get()); + auto vec = model.GetTensorShape(fNInput2); + size_t n = 1; + if (vec.size() > 0) + n = vec[0]; // size of shape input tensor + // copy values in fShape vector or fAttrAxes + if (fOpMode == Reshape) + fShape = std::vector(values, values + n); + else + fAttrAxes = std::vector(values, values + n); + + fShapeOutput = ShapeInference({fShapeInput})[0]; + // set flag to not write tensor in weight file. 
Its data will be hard-coded in way model is constructed + model.SetNotWritableInitializedTensor(fNInput2); + } else if (model.IsShapeTensor(fNInput2)) { + auto shapeData = model.GetShapeTensorValues(fNInput2); + fShapeOutput = shapeData; + } else { + // we cannot get shape at initialization time but at run-time + fDynamicShape = true; + // size of shape output us given by size of shape input tensor + auto shapeInput2 = model.GetTensorShape(fNInput2); + fShapeOutput.resize(shapeInput2[0]); + for (size_t i = 0; i < fShapeOutput.size(); i++) { + fShapeOutput[i] = Dim{ std::string("s_") + fNOutput + "_" + std::to_string(i)}; + } + } } else { - throw std::runtime_error("TMVA Reshape Op Shape Tensor " + fNShape + " is not found in model"); + throw std::runtime_error("TMVA Reshape Op 2nd input Tensor " + fNInput2 + " is not found in model"); } } else if (!fAttrAxes.empty()) { - // case fNShape is empty and axes are provided as attributes - std::vector descShape(fAttrAxes.size()); - std::copy(fAttrAxes.begin(), fAttrAxes.end(), descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; + // case fNShape is empty and axes are provided as attributes (e.g. 
for Unsqueeze) + std::cout << "attribute axes exists\n"; + fShapeOutput = ShapeInference({fShapeInput})[0]; } else if (fOpMode == Flatten || fOpMode == Squeeze) { fShapeOutput = ShapeInference({fShapeInput})[0]; } else { @@ -203,47 +304,103 @@ public: if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { fIsOutputConstant = true; auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); - if (ConvertShapeToLength(fShapeInput) != ConvertShapeToLength(fShapeOutput)) + auto o_shape = ConvertShapeToInt(fShapeOutput); + if (ConvertShapeToLength(ConvertShapeToInt(fShapeInput)) != ConvertShapeToLength(o_shape) ) throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths"); - model.AddConstantTensor(fNOutput, fShapeOutput, inputData); + model.AddConstantTensor(fNOutput, o_shape, inputData); if (model.Verbose()) { - std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertShapeToString(fShapeOutput) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeOutput), inputData) << std::endl; + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertDimShapeToString(fShapeOutput) << " : " << + ConvertValuesToString(ConvertShapeToLength(o_shape), inputData) << std::endl; } - } else { + } + // for shape tensors we can have it if output shape is size==1 or a scalar + else if (model.IsShapeTensor(fNData) && fShapeOutput.size() <=1) { + fIsOutputConstant = true; + auto inputData = model.GetShapeTensorValues(fNData); + model.AddShapeTensor(fNOutput, inputData); + if (model.Verbose()) { + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> " << fNOutput << " (shape) " << ConvertDimShapeToString(fShapeOutput) << " : " << + ConvertDimShapeToString(inputData) << std::endl; + } + } + else { // non-constant case 
model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); if (model.Verbose()) - std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> "<< fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> "<< fNOutput << " " << ConvertDimShapeToString(fShapeOutput) << std::endl; } } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; //no op for constant tensors - OpName = "op_" + OpName; - - // output of reshape is same as input - size_t length = ConvertShapeToLength(fShapeOutput); - if (length != ConvertShapeToLength(fShapeInput)) { - throw std::runtime_error("TMVA SOFIE Reshape Op : wrong output shape - is " + - ConvertShapeToString(fShapeOutput) + " and input is " + - ConvertShapeToString(fShapeInput)); - } std::stringstream out; - std::string opName = "Reshape"; + std::string opType = "Reshape"; if (fOpMode == Flatten) - opName = "Flatten"; + opType = "Flatten"; else if (fOpMode == Squeeze) - opName = "Squeeze"; + opType = "Squeeze"; else if (fOpMode == Unsqueeze) - opName = "Unsquueze"; + opType = "Unsquueze"; + + out << SP << "///--------" << opType << " operator " << opName << " --> " << ConvertDimShapeToString(fShapeOutput) << "\n"; - out << SP << "///--------" << opName << " operator\n" << std::endl; - out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << length << ", " << "tensor_" << fNOutput + // in case of dynamic output shape we need to set the shape value from input shape tensor + // and take case of the zero values + if (fDynamicShape) { + for (size_t i = 0; i < fShapeOutput.size(); i++) { + // since fNInput2 values are int64_t, should we check if they are negative? 
+ out << SP << "size_t " << fShapeOutput[i].param << " = " << "tensor_" << fNInput2 << "[" << i << "];\n"; + if (!fAllowZero) + out << SP << "if (tensor_" << fNInput2 << "[" << i << "] <= 0 ) " + << fShapeOutput[i].param << " = " << fShapeInput[i] << ";\n"; + } + } + + // output of reshape is same as input + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + // check needs to be done at run-time + out << SP << "if (" << lengthOut << "!=" << lengthIn << ")\n"; + out << "throw std::runtime_error(\"TMVA SOFIE Reshape Op : output lengths is different than input one\");\n"; + } + + + out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << lengthIn << ", " << "tensor_" << fNOutput << ");\n"; return out.str(); } +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + + opName = "op_" + opName; + + std::string opType = "Reshape"; + if (fOpMode == Flatten) opType = "Flatten"; + else if (fOpMode == Squeeze) opType = "Squeeze"; + else if (fOpMode == Unsqueeze) opType = "Unsqueeze"; + + std::stringstream out; + out << SP << "///------- " << opType << " operator " << opName << "\n"; + + if (fDynamicShape) { + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + out << SP << "if (" << lengthOut << " != " << lengthIn << ")\n"; + out << SP << SP << "throw std::runtime_error(\"TMVA SOFIE " << opType + << " Op : output length is different from input length\");\n"; + } + } + + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNOutput + << ", deviceBuf_" << fNData << ");\n"; + out << SP << "alpaka::wait(queue);\n"; + + return out.str(); +} + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx index 6951017..5b17a79 100644 --- 
a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx @@ -168,6 +168,114 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE ScatterElements Op called to Generate without being initialized first"); + } + + const std::size_t D = fShapeI.size(); + + auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); + auto strideI = UTILITY::ComputeStrideFromShape(fShapeI); + + std::size_t totalElements = 1; + for (std::size_t d = 0; d < D; ++d) + totalElements *= fShapeI[d]; + + std::string op; + op = "\n//------ SCATTERELEMENTS_KERNEL_ALPAKA\n"; + op += SP + "struct ScatterElementsKernel_" + opName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T* Y,\n"; + op += SP + SP + SP + "int64_t const* I,\n"; + op += SP + SP + SP + "T const* U,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "std::size_t remaining = elem_idx;\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const idx_" + std::to_string(d) + + " = remaining / " + strideI[d] + ";\n"; + op += SP + SP + SP + SP + "remaining -= idx_" + std::to_string(d) + + " * " + strideI[d] + ";\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "int64_t iAxis = I[elem_idx];\n"; + op += SP + SP + SP + SP + "if (iAxis < 0) iAxis += " + 
std::to_string(fShapeY[fAxis]) + ";\n\n"; + + op += SP + SP + SP + SP + "std::size_t const out_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == (std::size_t)fAxis) + ? "static_cast(iAxis)" + : "idx_" + std::to_string(d); + op += SP + SP + SP + SP + SP + coord + " * " + std::to_string(strideY[d]); + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + if (fReduction.empty() || fReduction == "none") { + op += SP + SP + SP + SP + "Y[out_idx] = U[elem_idx];\n"; + } else if (fReduction == "add") { + op += SP + SP + SP + SP + "alpaka::atomicAdd(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "mul") { + op += SP + SP + SP + SP + "alpaka::atomicMul(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "max") { + op += SP + SP + SP + SP + "alpaka::atomicMax(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "min") { + op += SP + SP + SP + SP + "alpaka::atomicMin(acc, &Y[out_idx], U[elem_idx]);\n"; + } + + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + return SP + "ScatterElementsKernel_" + opName + " scatterElementsKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE ScatterElements Op called to Generate without being initialized first"); + } + + std::size_t totalElements = ConvertShapeToLength(fShapeI); + + std::stringstream out; + out << "\n//------ SCATTERELEMENTS_GPU_ALPAKA\n"; + + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY << ", deviceBuf_" << fNX << ");\n"; + out << SP << "alpaka::wait(queue);\n\n"; + + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP 
<< "alpaka::KernelCfg const kernelCfg_" << opName << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName << ", devAcc, scatterElementsKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNI << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNU << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", scatterElementsKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNI << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNU << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); +} }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx index 52bdeae..34e69eb 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx @@ -101,6 +101,26 @@ public: return out.str(); } + std::string Generate_GPU_ALPAKA(std::string OpName) override { + // no need to generate code if the output is constant + if (fIsOutputConstant) return ""; + + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Shape op called to Generate without being initialized first"); + } + std::stringstream out; + + out << "\n//------ Shape\n"; + // add a dummy statement to avoid warning for unused input + out << SP << "(void) deviceBuf_" << fNX << ";\n"; + size_t length = ConvertShapeToLength(fOutput_shape); + for (size_t id = 0; id < length; id++) { + out << SP << "deviceBuf_" << fNY << "["<< id << "] = " << fShape[fStart+id] << ";\n"; + } + return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx 
index 68edd01..77f989c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx @@ -23,6 +23,7 @@ public: ROperator_Sigmoid(){} ROperator_Sigmoid(std::string nameX, std::string nameY): fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::SIGMOID; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; } @@ -61,6 +62,60 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ SIGMOID_KERNEL_ALPAKA\n"; + op += "struct SigmoidKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if(idx < numElements) {\n"; + op += SP + SP + SP + SP + "out[idx] = static_cast(1) / (static_cast(1) + exp(-data[idx]));\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "SigmoidKernel sigmoidKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Sigmoid called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ SIGMOID_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNX << " = {elementsPerGrid_" << fNX << ", elementsPerThread_" << fNX << "};\n"; + out << SP << "auto const workDiv_" << fNX << " = alpaka::getValidWorkDiv(kernelCfg_" << fNX << ", devAcc, sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), 
alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + std::vector GetStdLibs() override { return { std::string("cmath") };} }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx index 63fbcb3..c9af13e 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx @@ -153,6 +153,111 @@ public: return out.str(); } +std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fOutputShapes.empty()) + throw std::runtime_error("TMVA SOFIE Operator Split called to Generate without being initialized first"); + + const std::size_t D = fInputShape.size(); + const std::size_t Nin = fNYs.size(); + + auto inputStrides = UTILITY::ComputeStrideFromShape(fInputShape); + + std::string op; + op = "\n//------ SPLIT_KERNEL_ALPAKA\n"; + std::cout<<"Generating GPU kernel for Split operator with input shape "<< ConvertShapeToString(fInputShape) << " and output shapes : "; + for (std::size_t i = 0; i < Nin; ++i) { + std::cout<<"Loop running for output "<\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* input,\n"; + op += SP + SP + SP + "T* output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += 
SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fOutputShapes[i][d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == static_cast(fAxis)) + ? ("(out_" + std::to_string(d) + " + " + std::to_string(axis_offset) + "u)") + : ("out_" + std::to_string(d)); + op += SP + SP + SP + SP + SP + coord + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + } + std::cout<<"Finished generating GPU kernel for Split operator "<(1));\n"; + out << SP << SP << "auto const elementsPerGrid_" << i << " = Vec::all(Idx{" << length << "});\n"; + out << SP << SP << "alpaka::KernelCfg const kernelCfg_" << i + << " = {elementsPerGrid_" << i << ", elementsPerThread_" << i << "};\n"; + out << SP << SP << "auto const workDiv_" << i << " = alpaka::getValidWorkDiv(kernelCfg_" << i + << ", devAcc, " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNYs[i] << ")" + << ", static_cast(" << length << "));\n"; + out << SP << SP << "alpaka::exec(queue, workDiv_" << i + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNYs[i] << ")" + << ", static_cast(" << length << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + out << SP << "}\n"; + } + return out.str(); +} + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx index 354fbe3..608308c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx @@ -19,14 +19,17 @@ private: std::string fNRepeats; std::string fNInput; std::string fNY; - std::vectorfShapeInput; + std::vector fShapeInput; std::vector fShapeY; + std::vector fRepeats; public: ROperator_Tile(){} ROperator_Tile(std::string nameRepeat, std::string nameInput, std::string nameY): - fNRepeats(UTILITY::Clean_name(nameRepeat)),fNInput(UTILITY::Clean_name(nameInput)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNRepeats, fNInput }; + fNRepeats(UTILITY::Clean_name(nameRepeat)), + fNInput(UTILITY::Clean_name(nameInput)), + fNY(UTILITY::Clean_name(nameY)) { + fInputTensorNames = { fNRepeats, fNInput }; fOutputTensorNames = { 
fNY }; } @@ -36,114 +39,214 @@ public: std::vector> ShapeInference(std::vector> input) override { std::vector ret = input[0]; - - for(size_t i=0; i < input[1].size(); i++) { - ret[i]=ret[i]*input[1][i]; - } + for (size_t i = 0; i < input[1].size(); i++) + ret[i] = ret[i] * input[1][i]; return {ret}; } void Initialize(RModel& model) override { - //input must be a graph input, or already initialized intermediate tensor - if (model.CheckIfTensorAlreadyExist(fNInput) == false){ - throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); - } - if (model.CheckIfTensorAlreadyExist(fNRepeats) == false){ - throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); - } - fShapeInput=model.GetTensorShape(fNInput); + if (model.CheckIfTensorAlreadyExist(fNInput) == false) + throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); + if (model.CheckIfTensorAlreadyExist(fNRepeats) == false) + throw std::runtime_error("TMVA SOFIE Tile Op Repeats Tensor is not found in model"); - // if repeats vector is not initialized we cannot deduce shape of output - // not support for time being this case - if (!model.IsInitializedTensor(fNRepeats)) { + fShapeInput = model.GetTensorShape(fNInput); + + if (!model.IsInitializedTensor(fNRepeats)) throw std::runtime_error("TMVA SOFIE Tile Op: non-initialized repeats input is not supported"); - } - // Retrieve the data pointer for the repeats tensor - auto repptr = model.GetInitializedTensorData(fNRepeats); - // Cast the raw pointer to the appropriate type (size_t*) + auto repptr = model.GetInitializedTensorData(fNRepeats); auto repeats_data = static_cast(repptr.get()); - if (repeats_data == nullptr) { - throw std::runtime_error("Failed to retrieve the data for the repeats tensor."); - } - // Get the shape of the repeats tensor to determine the number of elements + if (repeats_data == nullptr) + throw std::runtime_error("TMVA SOFIE Tile Op: failed to retrieve repeats tensor 
data"); + auto repeats_shape = model.GetTensorShape(fNRepeats); - // Ensure the repeats tensor is 1D and get the number of elements - if (repeats_shape.size() != 1) { - throw std::runtime_error("Repeats tensor is not 1D."); - } - size_t num_elements = repeats_shape[0]; - // Convert the data to a vector of size_t - std::vector repeats_vector(num_elements); - std::copy(repeats_data, repeats_data + num_elements, repeats_vector.begin()); + if (repeats_shape.size() != 1) + throw std::runtime_error("TMVA SOFIE Tile Op: repeats tensor must be 1D"); + size_t num_elements = repeats_shape[0]; - fShapeY = ShapeInference({fShapeInput,repeats_vector})[0]; + // Save repeats if known at generation time so the GPU kernel can bake + // fShapeInput[d] directly without needing a runtime repeats pointer. + // fRepeats is left empty if repeats are not initialized (future case), + // which will cause the kernel to use the runtime repeats pointer path. + fRepeats.resize(num_elements); + std::copy(repeats_data, repeats_data + num_elements, fRepeats.begin()); + if (fRepeats.size()){ + model.RemoveInitializedTensor(fNRepeats); + } + fShapeY = ShapeInference({fShapeInput, fRepeats})[0]; model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY); if (model.Verbose()) - std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << " given repeats " << ConvertShapeToString(repeats_vector) << std::endl; + std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) + << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) + << " given repeats " << ConvertShapeToString(fRepeats) << std::endl; } std::string Generate(std::string OpName) override { OpName = "op_" + OpName; - if (fShapeInput.empty() || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Tile Op called to Generate without being initialized first"); + if (fShapeInput.empty() || fShapeY.empty()) + throw 
std::runtime_error("TMVA SOFIE Tile Op called to Generate without being initialized first"); + + std::stringstream out; + std::string input = "tensor_" + fNInput; + std::string output = "tensor_" + fNY; + std::string repeats = "tensor_" + fNRepeats; + + out << "///-------- Tile operator\n"; + out << "{\n"; + + out << SP << "const int input_shape[" << fShapeInput.size() << "] = {"; + for (size_t i = 0; i < fShapeInput.size(); ++i) { + if (i > 0) out << ", "; + out << fShapeInput[i]; + } + out << "};\n"; + + out << SP << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; + out << SP << "int s = 1;\n"; + + // Read repeats from the tensor at runtime so the generated code remains + // correct even if repeats become a runtime input/intermediate in the future + out << SP << "for (int i = " << fShapeInput.size() - 1 << "; i >= 0; i--) {\n"; + out << SP << SP << "int r = " << repeats << "[i];\n"; + out << SP << SP << "int i_offset = 0, o_offset = 0;\n"; + out << SP << SP << "s = s * input_shape[i];\n"; + out << SP << SP << "if (i == " << fShapeInput.size() - 1 << ") {\n"; + out << SP << SP << SP << "for (int j = 0; j < inputLength / s; j++) {\n"; + out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n"; + out << SP << SP << SP << SP << SP << "std::copy(" << input << " + i_offset, " + << input << " + i_offset + s, " + << output << " + o_offset);\n"; + out << SP << SP << SP << SP << SP << "o_offset += s;\n"; + out << SP << SP << SP << SP << "}\n"; + out << SP << SP << SP << SP << "i_offset += s;\n"; + out << SP << SP << SP << "}\n"; + out << SP << SP << "} else {\n"; + out << SP << SP << SP << "for (int j = inputLength / s - 1; j >= 0; j--) {\n"; + out << SP << SP << SP << SP << "o_offset = j * s * r;\n"; + out << SP << SP << SP << SP << "i_offset = j * s;\n"; + out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n"; + out << SP << SP << SP << SP << SP << "std::copy(" << output << " + i_offset, " + << output << " + i_offset + s, " + << 
output << " + o_offset);\n"; + out << SP << SP << SP << SP << SP << "o_offset += s;\n"; + out << SP << SP << SP << SP << "}\n"; + out << SP << SP << SP << "}\n"; + out << SP << SP << "}\n"; + out << SP << SP << "s *= r;\n"; + out << SP << SP << "inputLength *= r;\n"; + out << SP << "}\n"; + out << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE Operator Tile called to Generate without being initialized first"); + + const std::size_t D = fShapeInput.size(); + + auto inputStrides = UTILITY::ComputeStrideFromShape(fShapeInput); + auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeY); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + + // If fRepeats is populated, repeats were known at generation time and + // we can bake fShapeInput[d] as literals — no runtime repeats pointer needed. + // If fRepeats is empty (future: runtime repeats), pass repeats as a kernel arg. 
+ bool repeatsKnown = !fRepeats.empty(); + + std::string kname = "TileKernel_" + opName; + + std::string op; + op = "\n//------ TILE_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + if (!repeatsKnown) + op += SP + SP + SP + "int64_t const* __restrict__ repeats,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + // Decompose output linear index — output strides always compile-time + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + // Input index: fShapeInput[d] is always a compile-time constant since + // it is the input tensor shape, never runtime-variable. + // When repeatsKnown, we bake it directly as a literal. + // When not repeatsKnown (future), we still use fShapeInput[d] as a + // literal for the % — repeats pointer is only needed if fShapeY is dynamic. + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + SP + + "(out_" + std::to_string(d) + " % " + std::to_string(fShapeInput[d]) + "u)" + + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; } - //size_t input_length = ConvertShapeToLength(fShapeInput); - //size_t output_length = ConvertShapeToLength(fShapeY); + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "TileKernel_" + opName; + return SP + kname + " tileKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE Operator Tile called to Generate without being initialized first"); + + bool repeatsKnown = !fRepeats.empty(); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "tileKernel_" + opName; + + // Build argument list once, reused for both getValidWorkDiv and exec + std::string args = + "alpaka::getPtrNative(deviceBuf_" + fNInput + "), " + + "alpaka::getPtrNative(deviceBuf_" + fNY + ")"; + if (!repeatsKnown) + args += ", alpaka::getPtrNative(deviceBuf_" + fNRepeats + ")"; + args += ", static_cast(" + std::to_string(totalElements) + ")"; std::stringstream out; - std::string input = "tensor_" + fNInput; - std::string output = "tensor_" + fNY; - out << "///-------- Tile operator\n"; - out << "{\n"; // add scope to re-use same names - out << "const int input_shape[" << fShapeInput.size() << "] = " << ConvertShapeToString(fShapeInput) << ";\n"; - - out << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; - out << "int s = 1;\n"; - // loop from inverse dim order - out << "for (int i = " << fShapeInput.size()-1 << "; i >=0; i--) {\n"; - out << SP << "int r = tensor_" << fNRepeats << "[i];\n"; - // we cannot exclude case where repeats=1 since we need offset - //out << SP << "if (r == 1 && i < " << fShapeInput.size()-1 << ") continue;\n"; - out 
<< SP << "int i_offset = 0, o_offset = 0;\n"; - out << SP << "s = s * input_shape[i];\n"; - // case we have first copy - out << SP << "if (i == " << fShapeInput.size()-1 << ") {\n"; - out << SP << SP << "for (int j = 0; j < inputLength/s ; j++) {\n"; - out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n"; - out << SP << SP << SP << SP << "std::copy(" << input << "+ i_offset, " - << input << "+ i_offset + s, " << output << "+ o_offset);\n"; - out << SP << SP << SP << SP << "o_offset += s;\n"; - out << SP << SP << SP << "}\n"; // end k loop - out << SP << SP << SP << "i_offset += s;\n"; - out << SP << SP << "}\n"; // end j loop - out << SP << "} else {\n"; // second copy we do from output to output - // and we need to loop on j from reverse order to avoir re-writing in output tensor - out << SP << SP << "for (int j = inputLength/s - 1 ; j>=0; j--) {\n"; - out << SP << SP << SP << "o_offset = j*s*r;\n"; - out << SP << SP << SP << "i_offset = j*s;\n"; - out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n"; - out << SP << SP << SP << SP << "std::copy(" << output << "+ i_offset, " - << output << "+ i_offset + s, " << output << "+ o_offset);\n"; - out << SP << SP << SP << SP << "o_offset += s;\n"; - out << SP << SP << SP << "}\n"; // end k loop - out << SP << SP << "}\n"; // end j loop - out << SP << "}\n"; // end if - out << SP << "s *= r;\n"; - out << SP << "inputLength *= r;\n"; - out << "}\n"; // end i loop - out << "}\n"; // end of scope + out << "\n//------ TILE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "alpaka::KernelCfg const kernelCfg_" << opName + << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName + << ", devAcc, " << kname << ", " << 
args << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname << ", " << args << ");\n"; + out << SP <<"alpaka::wait(queue);\n"; return out.str(); } + }; }//SOFIE - #endif //SOFIE_ROPERATOR_Tile diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx index 11c40bb..de33544 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx @@ -165,6 +165,62 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) { + std::string op; + OpName = "op_" + OpName; + op = "\n//------ TRANSPOSE_KERNEL_ALPAKA\n"; + op += SP + "struct TransposeKernel_" + OpName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output,"; + op += "const std::size_t totalElements) const {\n"; + op += SP + SP + SP + SP + "auto const idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + SP + "if(idx >= totalElements) return;\n"; + op += SP + SP + SP + SP + "std::size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + "std::size_t remaining = idx;\n"; + op += SP + SP + SP + SP + "std::size_t coord;\n"; + + auto inputStrides = UTILITY::ComputeStrideFromShape(fShapeData); + auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeOutput); + + for (size_t k = 0; k < fShapeData.size(); k++) { + op += SP + SP + SP + SP + "coord = remaining / " + + std::to_string(outputStrides[k]) + "u;\n"; + op += SP + SP + SP + SP + "remaining = remaining - coord * " + + std::to_string(outputStrides[k]) + "u;\n"; + op += SP + SP + SP + SP + "input_idx += coord * " + + std::to_string(inputStrides[fAttrPerm[k]]) + "u;\n"; + } + + op += SP + SP + SP + SP + "output[idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override { + return SP + 
"TransposeKernel_op_" + OpName + " transposeKernel_" + OpName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + if (fShapeOutput.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Transpose called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertShapeToLength(fShapeOutput); + + out << "\n//------ TRANSPOSE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNOutput << " = {elementsPerGrid_" << fNOutput << ", elementsPerThread_" << fNOutput << "};\n"; + out << SP << "auto const workDiv_" << fNOutput << " = alpaka::getValidWorkDiv(kernelCfg_" << fNOutput << ", devAcc, transposeKernel_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fNData + << "), alpaka::getPtrNative(deviceBuf_" << fNOutput << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNOutput + << ", transposeKernel_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fNData + << "), alpaka::getPtrNative(deviceBuf_" << fNOutput << "), static_cast(" << length << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx index 28ac093..2a55700 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx @@ -7,11 +7,8 @@ #include - namespace SOFIE{ - - template class ROperator_Where final : public ROperator{ private: @@ -104,7 +101,7 @@ public: if (model.IsInitializedTensor(fNA)) { auto data = model.GetInitializedTensorData(fNA); std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), std::default_delete()); // Update the data and the shape of A model.AddConstantTensor(fNBroadcastedA, 
model.GetTensorType(fNA), fShapeY, broadcastedData); @@ -120,7 +117,7 @@ public: if (model.IsInitializedTensor(fNB)) { auto data = model.GetInitializedTensorData(fNB); std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), std::default_delete()); // do not update tensor B but add broadcasted one (since it can be input to some other operators) model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); @@ -136,7 +133,7 @@ public: if (model.IsInitializedTensor(fNC)) { auto data = model.GetInitializedTensorData(fNC); std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeC, fShapeY), + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeC, fShapeY), std::default_delete()); // do not update tensor C but add broadcasted one (since it can be input to some other operators) model.AddConstantTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeY, broadcastedData); @@ -150,32 +147,86 @@ public: fShapeY = fShapeA; } // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB) && model.IsInitializedTensor(fNC)) { - std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; + if (model.IsInitializedTensor(fNC)) { + std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); auto dataC = static_cast(model.GetInitializedTensorData(nameC).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) - dataY[i] = (dataC[i]) ? 
dataA[i] : dataB[i]; - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a file - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); model.SetNotWritableInitializedTensor(nameC); + T * dataA = nullptr; + T * dataB = nullptr; + std::vector shapeDataA; + std::vector shapeDataB; + if (model.IsInitializedTensor(fNA)) { + std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; + dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + // flag tensors to not be written in a file + model.SetNotWritableInitializedTensor(nameA); + } else if (model.IsShapeTensor(fNA)) + shapeDataA = model.GetShapeTensorValues(fNA); + if (model.IsInitializedTensor(fNB)) { + std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; + dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + model.SetNotWritableInitializedTensor(nameB); + } else if (model.IsShapeTensor(fNB)) + shapeDataB = model.GetShapeTensorValues(fNB); - fIsOutputConstant = true; - if (model.Verbose()) + std::vector dataY; + std::vector shapeDataY; + + bool isOutputConstantTensor = true; + if (dataA && dataB) { + dataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < dataY.size(); i++) + dataY[i] = (dataC[i]) ? dataA[i] : dataB[i]; + } + else if (dataA && shapeDataB.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? Dim{size_t(dataA[i])} : shapeDataB[i]; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + else if (dataB && shapeDataA.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? 
shapeDataB[i] : Dim{size_t(dataB[i])}; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + else if (shapeDataB.size() > 0 && shapeDataA.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? shapeDataA[i] : shapeDataB[i]; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + fIsOutputConstant = true; // this contains both case constant tensor output ans shape tensor output + if (isOutputConstantTensor && dataY.empty()) { + dataY.resize(shapeDataY.size()); + for (size_t i = 0; i < shapeDataY.size(); i++) + dataY[i] = static_cast(shapeDataY[i].dim); + } + if (dataY.size() > 0) + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + else if (shapeDataY.size() > 0 ) + model.AddShapeTensor(fNY, shapeDataY, fShapeY.size() == 0); + else { + fIsOutputConstant = false; + } + if (fIsOutputConstant && model.Verbose()) std::cout << "Where op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - + << ((dataY.size() > 0) ? ConvertValuesToString(dataY) : ConvertDimShapeToString(shapeDataY) ) + << ((dataY.size() > 0) ? 
" (constant)" : " (shape)") << std::endl; + // output is a constant tensor - fOutputTensorNames.pop_back(); + if (fIsOutputConstant) fOutputTensorNames.pop_back(); } - else { + if (!fIsOutputConstant) { model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) + std::cout << "Where op " << " condition : " << fNC << " " << ConvertShapeToString(fShapeC) << + " X " << fNA << " " << ConvertShapeToString(fShapeA) << " Y " << fNB << " " << ConvertShapeToString(fShapeB) + << " ---> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; } } @@ -184,52 +235,51 @@ public: return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; if (fShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Where Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//-------- Where \n"; + out << SP << "\n//-------- Where " << opName << " --> " << ConvertShapeToString(fShapeY) << "\n"; size_t length = ConvertShapeToLength(fShapeY); std::string typeName = TensorType::Name(); // Broadcast A if it's uninitialized if (fShapeA != fShapeY) { out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; //out << SP << "{\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; + out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) + << ", tensor_" << fNBroadcastedA << ");\n"; } // Broadcast B if it's uninitialized if (fShapeB != fShapeY) { out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; //out << SP << "{\n"; - out << SP << 
"SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; + out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) + << ", tensor_" << fNBroadcastedB << ");\n"; } // Broadcast C if it's uninitialized if (fShapeC != fShapeY) { // special case if C is an input tensor if (fIsInputBoolTensor) { size_t inputLength = ConvertShapeToLength(fShapeC); - out << SP << "std::vector fTensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; + out << SP << "std::vector tmp_tensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; } out << SP << "// Broadcasting uninitialized tensor " << fNC << "\n"; //out << SP << "{\n"; - // for boolean we need to pass vector and use the non-template version of the function - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(fTensor_" << fNC << ", " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedC << ");\n"; + out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tmp_tensor_" << fNC << ".data(), " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) + << ", tensor_" << fNBroadcastedC << ");\n"; } std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - // get output tensor applying condition (note we need to use directly the vector since v.data(), i.e the data pointer, does not exist) - out << SP << SP << "tensor_" << fNY << "[id] = " << "(fTensor_" << nameC << "[id]) ? 
tensor_" + // get output tensor applying condition + out << SP << SP << "tensor_" << fNY << "[id] = " << "tensor_" << nameC << "[id] ? tensor_" << nameA << "[id] : tensor_" + nameB + "[id];\n"; out << SP << "}\n"; return out.str(); @@ -239,5 +289,4 @@ public: }//SOFIE - -#endif //SOFIE_ROperator_Where +#endif //TMVA_SOFIE_ROperator_Where diff --git a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx index d183052..d59eee8 100644 --- a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx +++ b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx @@ -1,7 +1,7 @@ #ifndef SOFIE_SOFIE_COMMON #define SOFIE_SOFIE_COMMON -#include "TMVA/RTensor.hxx" +#include "SOFIE/RTensor.hxx" #include "ROOT/RSpan.hxx" @@ -21,13 +21,10 @@ #include #include - -namespace SOFIE{ - -//typedef RTensor tensor_t; +namespace SOFIE { enum class ETensorType{ - UNDEFINED = 0, FLOAT = 1, UNINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive + UNDEFINED = 0, FLOAT = 1, UINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive FLOAT16 = 10, DOUBLE = 11, UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16 }; @@ -39,7 +36,7 @@ constexpr size_t GetTypeSize(ETensorType type) { switch (type) { case ETensorType::FLOAT: return sizeof(float); case ETensorType::DOUBLE: return sizeof(double); - case ETensorType::UNINT8: return sizeof(uint8_t); + case ETensorType::UINT8: return sizeof(uint8_t); case ETensorType::INT8: return sizeof(int8_t); case ETensorType::UINT16: return sizeof(uint16_t); case ETensorType::INT16: return sizeof(int16_t); @@ -58,6 +55,9 @@ typedef std::int64_t int_t; std::string ConvertTypeToString(ETensorType type); ETensorType ConvertStringToType(std::string type); +// find if a string represents a number +bool IsInteger(const std::string & s); + struct Dim{ bool isParam = false; size_t dim = 0; @@ -67,16 +67,42 @@ struct Dim{ Dim() {} // constructor for a 
parametric dimension with the option to pass a default dim value - Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) {} + // We use -1 for dim to indicate that the param dimension is an expression (e.g. "d1+d2") + // in case the string represents a number make Dim not parametric + Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) + { + if (IsInteger(p)) { + isParam = false; + dim = std::stoi(p); + } + } // constructor for a non-parametric dimension Dim(size_t d) : dim(d) {} std::string GetVal() const { - return (isParam) ? param : std::to_string(dim); + // cast to int64_t for negative shape values + return (isParam) ? param : std::to_string(static_cast(dim)); + } + + std::ostream& operator<< (std::ostream& os) const { + os << GetVal(); + return os; + } + + bool operator==(const Dim& rhs) const { + return (isParam && rhs.isParam) ? param == rhs.param : dim == rhs.dim; + } + bool operator!=(const Dim& rhs) const { + return !(*this == rhs); } }; +//bool operator==(const Dim& lhs, const Dim& rhs); +inline std::ostream & operator<< (std::ostream &os, const Dim &d) { + os << d.GetVal(); + return os; +} struct InputTensorInfo{ ETensorType type; @@ -93,6 +119,18 @@ struct DynamicTensorInfo{ std::vector shape; }; +// template traits for Tensor Shape +template +struct TensorShape {}; +template<> +struct TensorShape { + static bool IsDim() { return true; } +}; +template<> +struct TensorShape { + static bool IsDim() { return false; } +}; + // template traits for Tensor type template struct TensorType {}; @@ -120,6 +158,18 @@ template<> struct TensorType { static const std::string Name() { return "uint64_t"; } }; +template<> +struct TensorType { + static const std::string Name() { return "bool"; } +}; +template<> +struct TensorType { + static const std::string Name() { return "int8_t"; } +}; +template<> +struct TensorType { + static const std::string Name() { return "uint8_t"; } +}; struct TensorMemoryInfo { std::string_view 
tensor_name; @@ -148,19 +198,17 @@ struct MemoryPoolInfo { std::map available_stack; }; -std::vector ConvertShapeToDim(std::vector shape); +std::vector ConvertShapeToDim(const std::vector & shape); + +std::vector ConvertShapeToInt(const std::vector & shape); -std::vector ConvertShapeToInt(std::vector shape); +std::size_t ConvertShapeToLength(const std::vector & shape); -std::size_t ConvertShapeToLength(std::vector shape); +std::string ConvertShapeToString(const std::vector & shape); +std::string ConvertDimShapeToString(const std::vector & shape); -std::string ConvertShapeToString(std::vector shape); -std::string ConvertDynamicShapeToString(std::vector shape); -// std::string ConvertShapeToString(std::vector shape) { -// return ConvertDynamicShapeToString(shape); -// } +std::string ConvertDimShapeToLength(const std::vector & shape); -std::string ConvertDynamicShapeToLength(std::vector shape); template std::string ConvertValToString(T value) { @@ -179,8 +227,11 @@ std::string ConvertValuesToString(size_t n, const T * data) { ret << "{ "; for (size_t i = 0; i < n; i++) { if (std::is_floating_point_v) - ret << std::setprecision(std::numeric_limits::max_digits10); - ret << data[i]; + ret << std::setprecision(std::numeric_limits::max_digits10) << data[i]; + else + // cast in case of boolean (int8) + ret << (int64_t) data[i]; + if (i < n-1) ret << ", "; } ret << "}"; @@ -206,8 +257,14 @@ public: bool IsConstantTensor() const { return fConstant;} // query if tensor needs to be written in a weight file. Constant tensors are not written in a file bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;} + // check if a Tensor is Writable (need to be written in the file or in the generated code (e.g. 
as a constant tensor) + // if an initialized tensors is used in a constant operator at compile time does not need to be written and can be omitted in + // the generated code + bool IsNotWritable() const { return fIsNotWritable; } // set not writable initialized tensors - i.e. tensor that must not be written in a file void SetNotWritable() { fIsNotWritable = true;} + // set as constant (needed for non-float initialized tensors) + void SetConstant() { fConstant = true;} template T const *data() const @@ -230,7 +287,7 @@ public: case ETensorType::INT64: fSize *= sizeof(int64_t); break; case ETensorType::BOOL: fSize *= sizeof(bool); break; default: - throw std::runtime_error("TMVA::SOFIE doesn't yet supports serialising data-type " + + throw std::runtime_error("SOFIE doesn't yet supports serialising data-type " + ConvertTypeToString(fType)); } fPersistentData = static_cast(fData.get()); @@ -271,7 +328,7 @@ private: template ETensorType GetTemplatedType(T /*obj*/ ){ if (std::is_same::value) return ETensorType::FLOAT; - if (std::is_same::value) return ETensorType::UNINT8; + if (std::is_same::value) return ETensorType::UINT8; if (std::is_same::value) return ETensorType::INT8; if (std::is_same::value) return ETensorType::UINT16; if (std::is_same::value) return ETensorType::INT16; @@ -287,6 +344,12 @@ ETensorType GetTemplatedType(T /*obj*/ ){ } namespace UTILITY{ + + + +// clean operator and tensor names +std::string Clean_name(std::string input_tensor_name); + // Check if two shapes are equal bool AreSameShape(const std::vector&, const std::vector&); bool AreSameShape(const std::vector&, const std::vector&); @@ -296,17 +359,21 @@ bool AreSameShape(const std::vector&, const std::vector&); // Multidirectional broadcast a list of tensors to the same shape std::vector MultidirectionalBroadcastShape(std::vector>); -// Unidirectional broadcast two shapes to the same shape -std::vector UnidirectionalBroadcastShape(std::vector, std::vector); +// Multidirectional broadcast two 
shapes to the same shape + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); +std::vector UnidirectionalBroadcastShape(std::vector &, std::vector &); + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); + -std::string Clean_name(std::string input_tensor_name); template T* BroadcastConvBias(const T* data, const size_t channel, const std::vector& targetShape) { size_t size = targetShape.size(); if (targetShape[1] != channel) { std::stringstream ss; - ss << "TMVA::SOFIE - Error broadcasting Conv Bias of shape {"; + ss << "SOFIE - Error broadcasting Conv Bias of shape {"; ss << std::to_string(channel); ss << "} to "; ss << ConvertShapeToString(targetShape); @@ -343,16 +410,14 @@ T* BroadcastConvBias(const T* data, const size_t channel, const std::vector, class ContT = std::span > -void BroadcastTensor(ConstContT data, const std::vector& shape, const std::vector& targetShape, ContT broadcastedData) { +template> +void BroadcastTensor(ConstContT data, const std::vector& shape, const std::vector& targetShape, T *broadcastedData) { // Size of the shapes (tensor input here have shapes with same sizes, we have already added the needed ones ) size_t size = shape.size(); // Current length of the broadcasted tensor size_t curLength = data.size(); - size_t targetLength = broadcastedData.size(); - assert(ConvertShapeToLength(targetShape) == targetLength); // special case when broadcasting last dimensions (initial shapes must be the same) - if (shape.front() == targetShape.front() && shape.back() == 1 && size > 1) { + if (size > 1 && shape.front() == targetShape.front() && shape.back() == 1) { size_t bsize = targetShape.back(); // compute the size of the data to broadcast for (int k = int(size)-2; k >=0; k--) { @@ -360,16 +425,16 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st bsize *= targetShape[k]; } for (size_t i = 0; i < curLength; i++) { - std::fill(broadcastedData.begin() + i*bsize, 
broadcastedData.begin() + (i+1)*bsize , data[i]); + std::fill(broadcastedData + i*bsize, broadcastedData + (i+1)*bsize , data[i]); } return; } - std::copy(data.begin(), data.end(), broadcastedData.begin()); + std::copy(data.begin(), data.end(), broadcastedData); // Product of the previous dimensions of targetShape size_t arrayNum = 1; // New broadcasted data: is this needed? - std::vector newData(targetLength); + std::vector newData(ConvertShapeToLength(targetShape)); for (size_t idx = 0; idx < size; idx++) { size_t dim = shape[idx]; @@ -385,8 +450,8 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st for (size_t arrayIdx = 0; arrayIdx < arrayNum; arrayIdx++) { for (size_t targetIdx = 0; targetIdx < targetDim; targetIdx++) { size_t offset = arrayIdx * arrayLength * targetDim + targetIdx * arrayLength; - std::copy(broadcastedData.begin() + arrayIdx * arrayLength, - broadcastedData.begin() + (arrayIdx + 1) * arrayLength, + std::copy(broadcastedData + arrayIdx * arrayLength, + broadcastedData + (arrayIdx + 1) * arrayLength, newData.begin() + offset); } } @@ -400,12 +465,11 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st // Update current length curLength = newLength; // Update broadcasted data - std::copy(newData.begin(), newData.begin() + newLength, broadcastedData.begin()); + std::copy(newData.begin(), newData.begin() + newLength, broadcastedData); } // Update the number of arrays arrayNum *= targetDim; } - //return broadcastedData; } // interface where we allocate a new array for broadcasted data @@ -413,10 +477,8 @@ template T* CreateBroadcastTensor(const T* data, const std::vector& shape, const std::vector& targetShape, size_t targetLength) { // newShape is an array of size equal to dimension along which we are broadcasting the tensor T* broadcastedData = new T[targetLength]; - std::span bData(broadcastedData, broadcastedData+targetLength); size_t curLength = ConvertShapeToLength(shape); - std::span inData(data, 
curLength); - BroadcastTensor, std::span>(inData, shape, targetShape, bData); + BroadcastTensor({data, curLength}, shape, targetShape, broadcastedData); return broadcastedData; } // Unidirectional broadcasting shape to targetShape// In unidirectional broadcast - only tensor B can have the shape changed not @@ -429,14 +491,14 @@ T* UnidirectionalBroadcast(const T* data, const std::vector& shape, cons std::vector newShape(targetSize, 1); size_t offset = targetSize - shape.size(); std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape)); + return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape)); } - return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape)); + return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape)); } // Unidirectional broadcasting shape to targetShape using a passed vector to avoid allocations template -void UnidirectionalBroadcast(const T* data, const std::vector& shape, const std::vector& targetShape, std::span broadcastedData) { +void UnidirectionalBroadcast(const T* data, const std::vector& shape, const std::vector& targetShape, T *broadcastedData) { size_t curLength = ConvertShapeToLength(shape); std::span inData(const_cast(data), curLength); // Prepend shape with ones @@ -445,12 +507,10 @@ void UnidirectionalBroadcast(const T* data, const std::vector& shape, co std::vector newShape(targetSize, 1); size_t offset = targetSize - shape.size(); std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - BroadcastTensor(inData, newShape, targetShape, broadcastedData); + BroadcastTensor(inData, newShape, targetShape, broadcastedData); } - BroadcastTensor>(inData, shape, targetShape, broadcastedData); + BroadcastTensor(inData, shape, targetShape, broadcastedData); } -// specialization for vector of boolean -void UnidirectionalBroadcast(const 
std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData); /// compute stride of a tensor given its shape (assume layout is row-major) std::vector ComputeStrideFromShape(const std::vector & shape); @@ -619,7 +679,15 @@ void col2im(const Dtype* data_col, const int channels, //std::cout << "finishing col2imp" << std::endl; } - +// Used at the end of infer() to fill the return object. +template +void FillOutput(T const *arr, std::vector &out, std::size_t n) +{ + out.resize(n); + for (std::size_t i = 0; i < n; ++i) { + out[i] = arr[i]; + } +} } // end namespace UTILITY @@ -631,20 +699,20 @@ extern "C" void sgemm_(const char * transa, const char * transb, const int * m, struct GNN_Data { - TMVA::Experimental::RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) - TMVA::Experimental::RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) - TMVA::Experimental::RTensor global_data; // the global features, tensor with shape (1, num_global_features) - TMVA::Experimental::RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) + RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) + RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) + RTensor global_data; // the global features, tensor with shape (1, num_global_features) + RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) // edge_index[0,:] are the receivers and edge_index[1,:] are the senders // need to have default constructor since RTensor has not one - GNN_Data(): node_data(TMVA::Experimental::RTensor({})), edge_data(TMVA::Experimental::RTensor({})), global_data(TMVA::Experimental::RTensor({})), edge_index(TMVA::Experimental::RTensor({})) {} + GNN_Data(): node_data(RTensor({})), 
edge_data(RTensor({})), global_data(RTensor({})), edge_index(RTensor({})) {} }; template -TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, TMVA::Experimental::RTensor & t2, int axis = 0) +RTensor Concatenate( RTensor & t1, RTensor & t2, int axis = 0) { // concatenate tensor along axis. Shape must be the same except in the dimension of the concatenated axis if (t1.GetMemoryLayout() != t2.GetMemoryLayout()) @@ -659,8 +727,8 @@ TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, } std::vector outShape = shape1; outShape[axis] = shape1[axis] + shape2[axis]; - TMVA::Experimental::RTensor tout(outShape, t1.GetMemoryLayout()); - if (t1.GetMemoryLayout() == TMVA::Experimental::MemoryLayout::ColumnMajor) { + RTensor tout(outShape, t1.GetMemoryLayout()); + if (t1.GetMemoryLayout() == MemoryLayout::ColumnMajor) { throw std::runtime_error("TMVA RTensor Concatenate is not yet supported for column major tensors"); } @@ -693,10 +761,10 @@ inline GNN_Data Concatenate(GNN_Data & data1, GNN_Data & data2, int axis = 0) { inline GNN_Data Copy(const GNN_Data & data) { GNN_Data out; - out.node_data = TMVA::Experimental::RTensor(data.node_data.GetShape()); - out.edge_data = TMVA::Experimental::RTensor(data.edge_data.GetShape()); - out.global_data = TMVA::Experimental::RTensor(data.global_data.GetShape()); - out.edge_index = TMVA::Experimental::RTensor(data.edge_index.GetShape()); + out.node_data = RTensor(data.node_data.GetShape()); + out.edge_data = RTensor(data.edge_data.GetShape()); + out.global_data = RTensor(data.global_data.GetShape()); + out.edge_index = RTensor(data.edge_index.GetShape()); std::copy(data.node_data.GetData(), data.node_data.GetData()+ data.node_data.GetSize(), out.node_data.GetData()); std::copy(data.edge_data.GetData(), data.edge_data.GetData()+ data.edge_data.GetSize(), out.edge_data.GetData()); std::copy(data.global_data.GetData(), data.global_data.GetData()+ data.global_data.GetSize(), 
out.global_data.GetData()); @@ -704,6 +772,70 @@ inline GNN_Data Copy(const GNN_Data & data) { return out; } -}//SOFIE +inline void Gemm_Call(float *output, bool transa, bool transb, int m, int n, int k, float alpha, const float *A, + const float *B, float beta, const float *C) +{ + char ct = 't'; + char cn = 'n'; + const int *lda = transa ? &k : &m; + const int *ldb = transb ? &n : &k; + const int *ldc = &m; + if (C != nullptr) { + std::copy(C, C + m * n, output); + } + SOFIE::BLAS::sgemm_(transa ? &ct : &cn, transb ? &ct : &cn, &m, &n, &k, &alpha, A, lda, B, ldb, + &beta, output, ldc); +} + +template +void ReadTensorFromStream(std::istream &is, T &target, std::string const &expectedName, std::size_t expectedLength) +{ + std::string name; + std::size_t length; + is >> name >> length; + if (name != expectedName) { + std::string err_msg = + "TMVA-SOFIE failed to read the correct tensor name; expected name is " + expectedName + " , read " + name; + throw std::runtime_error(err_msg); + } + if (length != expectedLength) { + std::string err_msg = "TMVA-SOFIE failed to read the correct tensor size; expected size is " + + std::to_string(expectedLength) + " , read " + std::to_string(length); + throw std::runtime_error(err_msg); + } + for (size_t i = 0; i < length; ++i) { + is >> target[i]; + } + if (is.fail()) { + throw std::runtime_error("TMVA-SOFIE failed to read the values for tensor " + expectedName); + } +} + + +// code for the memory greeding allocations +struct TensorLifeInfo { + int begin; // start time (op index) lifetime + int end; // end time lifetime + size_t size; // size of tensors in bytes +}; + +struct MemoryResult { + std::size_t total_bytes = 0; // total memory needed + std::vector offsets; // resulted offsets for each tensor +}; + +/// Greedy best-fit planner with coalescing free list. 
+MemoryResult OrganizeMemory(const std::vector & tensorsInfo ); + + +inline std::string ConvertOutputTypeToString(ETensorType t) { + // The std::vector is a special type that is not wrapping continuous memory. + // We don't want to use it as a return type. + if (t == ETensorType::BOOL) t = ETensorType::UINT8; + return ConvertTypeToString(t); +} + + +} // namespace SOFIE -#endif //TMVA_SOFIE_RMODEL +#endif //TMVA_SOFIE_COMMON diff --git a/src/SOFIE_core/src/RModel.cxx b/src/SOFIE_core/src/RModel.cxx index e5495ed..3dd1d23 100644 --- a/src/SOFIE_core/src/RModel.cxx +++ b/src/SOFIE_core/src/RModel.cxx @@ -4,55 +4,21 @@ #include #include +#ifdef SOFIE_SUPPORT_ROOT_BINARY #include "TFile.h" +#endif #include "SOFIE/RModel.hxx" #include "SOFIE/SOFIE_common.hxx" - namespace SOFIE { -std::underlying_type_t operator|(Options opA, Options opB) { - return static_cast>(opA) | static_cast>(opB); -} -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { - return opA | static_cast>(opB); -} - -RModel::RModel(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; +namespace { +const std::string SP = " "; } -RModel& RModel::operator=(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); 
- fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; - return *this; -} -const std::vector& RModel::GetTensorShape(std::string name) const { +const std::vector& RModel::GetTensorShape(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { return f->second.shape; @@ -69,6 +35,16 @@ const std::vector& RModel::GetTensorShape(std::string name) const { if (f4 != fIntermediateTensorInfos.end()) { return f4->second.shape; } + // case of shape tensors + auto f5 = fShapeTensors.find(name); + if (f5 != fShapeTensors.end()) { + // shape is vector of size 1 with size of shape values or just a scalar + if (f5->second.second) // check scalar flag + return std::vector{}; + else + return std::vector{f5->second.first.size()}; + } + if (fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is a dynamic tensor. 
Use GetDynamicTensorShape instead of GetTensorShape"); @@ -78,7 +54,7 @@ const std::vector& RModel::GetTensorShape(std::string name) const { throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); } -std::vector RModel::GetDynamicTensorShape(std::string name) const { +std::vector RModel::GetDimTensorShape(const std::string & name) const { if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { return f->second.shape; } @@ -89,8 +65,21 @@ std::vector RModel::GetDynamicTensorShape(std::string name) const { // for this we need to return the vector by value return ConvertShapeToDim(GetTensorShape(name)); } +std::vector RModel::GetDynamicTensorShape(const std::string & name) const { + if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { + return f->second.shape; + } + if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { + return f->second.shape; + } + // throw error if shape is not dynamic + if (!IsDynamicTensor(name)) + throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not dynamic"); + + throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); +} -const ETensorType& RModel::GetTensorType(std::string name) const { +const ETensorType& RModel::GetTensorType(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { return f->second.type; @@ -111,6 +100,10 @@ const ETensorType& RModel::GetTensorType(std::string name) const { if (f5 != fDynamicTensorInfos.end()){ return f5->second.type; } + // case of shape tensor type is INT64 + if (fShapeTensors.find(name) != fShapeTensors.end()){ + return ETensorType::INT64; + } if (fIsSubGraph && fParentGraph) return fParentGraph->GetTensorType(name); @@ -124,6 +117,7 @@ bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) { if 
(fInitializedTensors.find(tensor_name) != fInitializedTensors.end()) return true; if (fIntermediateTensorInfos.find(tensor_name) != fIntermediateTensorInfos.end()) return true; if (fDynamicTensorInfos.find(tensor_name) != fDynamicTensorInfos.end()) return true; + if (fShapeTensors.find(tensor_name) != fShapeTensors.end()) return true; if (fIsSubGraph && fParentGraph) return fParentGraph->CheckIfTensorAlreadyExist(tensor_name); return false; } @@ -192,16 +186,34 @@ void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::v tensor_name = UTILITY::Clean_name(tensor_name); //NB: own data if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); + throw std::runtime_error("TMVA-SOFIE: constant tensor with name " + tensor_name + " already exists \n"); } InitializedTensor new_tensor {type, shape, data, true}; // add here flag to specify is a constant tensor fInitializedTensors[tensor_name] = new_tensor; } +void RModel::AddShapeTensor(const std::string & name, const std::vector & shape_values, bool scalar){ + auto tensor_name = UTILITY::Clean_name(name); + if (fShapeTensors.count(tensor_name) != 0) { + throw std::runtime_error("TMVA-SOFIE: shape tensor with name " + tensor_name + " already exists \n"); + } + fShapeTensors[tensor_name] = std::make_pair(shape_values, scalar); +} + +bool RModel::IsShapeTensor(const std::string & tensor_name) const { + return fShapeTensors.count(tensor_name) != 0; +} + +const std::vector & RModel::GetShapeTensorValues(const std::string & tensor_name) const { + //if (!IsShapeTensor(tensor_name) ) return std::vector{}; + return fShapeTensors.at(tensor_name).first; +} + bool RModel::IsInitializedTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); return fInitializedTensors.find(name) != fInitializedTensors.end(); } + bool RModel::IsConstantTensor(const std::string& tensorName) const 
{ std::string name = UTILITY::Clean_name(tensorName); auto itr = fInitializedTensors.find(name); @@ -209,9 +221,11 @@ bool RModel::IsConstantTensor(const std::string& tensorName) const { return itr->second.IsConstantTensor(); } +// dynamic tensors include also Dim input tensors bool RModel::IsDynamicTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); - return fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + bool ret = fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + return (ret) ? true : IsDimInputTensor(tensorName); } bool RModel::IsDimInputTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); @@ -250,17 +264,21 @@ void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::ve // store shape parameter if not existing for (auto &d : shape) { if (d.isParam) { - if (fShapeParams.count(d.param) == 0) { - // case parameter is an expression of some other existing parameter, no need to - // register it - if (d.dim != size_t(-1)) { - fShapeParams[d.param] = std::to_string(d.dim); - } + if (d.dim != size_t(-1)) { + AddShapeParam(d.param, d.dim); } } } } +void RModel::AddShapeParam(const std::string & param, size_t default_value) { + if (fShapeParams.count(param) == 0) { + fShapeParams[param] = std::to_string(default_value); + // add also in the vector list (used to keep the order) + fDimShapeNames.push_back(param); + } +} + void RModel::AddOutputTensorNameList(std::vector outputtensornames) { fOutputTensorNames.clear(); for(auto& it : outputtensornames) { @@ -293,6 +311,15 @@ std::shared_ptr RModel::GetInitializedTensorData(std::string tensor_name) } } +void RModel::RemoveInitializedTensor(std::string tensor_name) { + auto f = fInitializedTensors.find(tensor_name); + if (f == fInitializedTensors.end()) { + throw std::runtime_error("TMVA-SOFIE: tensor " + tensor_name + " not found when trying to remove it"); + } else { + 
fInitializedTensors.erase(f); + } +} + void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { auto t = fInitializedTensors.find(tensor_name); if (t == fInitializedTensors.end()) { @@ -301,100 +328,180 @@ void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { t->second.SetNotWritable(); } -std::string RModel:: AllocateIntermediateMemory(std::span op_output_tensors) { +std::string RModel::AllocateIntermediateMemory(std::span op_output_tensors) +{ + std::stringstream code; - std::string memory_allocation_string = ""; - bool allocated; + if (fVerbose) { + std::cout << "Total chunks allocated\n"; + for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + std::cout << "..... chunk " << chunk->first << " size " << chunk->second.tensor_size << " " << chunk->second.tensor_name << std::endl; + } + } - for (auto& it : op_output_tensors) { - allocated = false; - if (GetTensorType(std::string(it)) == ETensorType::BOOL || - fInitializedTensors.find(std::string(it)) != fInitializedTensors.end() || - fDynamicTensorInfos.find(std::string(it)) != fDynamicTensorInfos.end()) continue; + auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) { + std::string typeName = ConvertTypeToString(GetTensorType(name)); + code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes"; + code << "\n" + << typeName << "* tensor_" << name << " = reinterpret_cast<" << typeName + << "*>(fIntermediateMemoryPool.data() + " << location << ");\n"; + }; + + if (fVerbose) std::cout << "*** AllocateIntermediateMemory: Loop on op output tensors\n"; + // order output tensors by size + std::vector ordered_output_tensors; + + for (auto &it : op_output_tensors) { + auto name = std::string(it); + if (GetTensorType(name) == ETensorType::BOOL || fInitializedTensors.find(name) != fInitializedTensors.end() || + 
fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) + continue; + + auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name)); + // important fill the pair in the ordered output tensors with the string view and not the string + TensorMemoryInfo tmi = {it, tensor_size}; + ordered_output_tensors.push_back(tmi); + } + std::sort(ordered_output_tensors.begin(), ordered_output_tensors.end(), + [](const TensorMemoryInfo &a, const TensorMemoryInfo &b) { return a.tensor_size > b.tensor_size; }); - auto tensor_size = GetTypeSize(GetTensorType(std::string(it))) * ConvertShapeToLength(GetTensorShape(std::string(it))); - memory_allocation_string += "\n // Allocating memory for intermediate tensor " + std::string(it) + " with size " + std::to_string(tensor_size) + " bytes"; + for (auto &it : ordered_output_tensors) { + bool allocated = false; + std::string name = std::string{it.tensor_name}; + size_t tensor_size = it.tensor_size; + if (fVerbose) + std::cout << "output tensor " << name << " size " << tensor_size << std::endl; - for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); chunk != fIntermediateMemoryInfo.available_stack.end(); ) { + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end();) { - // check if available memory chunks can accommodate the tensor - if (chunk->second >= tensor_size) { - auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it, tensor_size); - auto new_chunk_location = chunk->first+chunk->second-tensor_size; - fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; + if (fVerbose) std::cout << ".. available chunk " << chunk->first << " with size = " << chunk->second; + // check if available memory chunks can accommodate the tensor + if (chunk->second >= tensor_size) { + // need to use here string_view (i.e it.tensor_name) + // split returns the new chunk with size of new tensor. 
The free chunk is before the used one + auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it.tensor_name, tensor_size); + auto new_chunk_location = chunk->first + chunk->second - tensor_size; + fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; - memory_allocation_string += "\n" + ConvertTypeToString(GetTensorType(std::string(it))) + - "* tensor_" + std::string(it) + - " = reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(new_chunk_location) + ");\n"; - chunk->second -= tensor_size; + declareIntermediateTensor(name, tensor_size, new_chunk_location); + chunk->second -= tensor_size; - allocated = true; + allocated = true; - if (chunk->second == 0) { - chunk = fIntermediateMemoryInfo.available_stack.erase(chunk); - } + if (fVerbose) std::cout << " is re-used and split in a new of size " << new_chunk.tensor_size << " at " << new_chunk_location; - break; - } - ++chunk; + if (chunk->second == 0) { + if (fVerbose) std::cout << " and deleted since size matches"; + fIntermediateMemoryInfo.available_stack.erase(chunk); } + if (fVerbose) std::cout << std::endl; + break; + } else if (chunk->first == fIntermediateMemoryInfo.available_stack.rbegin()->first && + fIntermediateMemoryInfo.total_stack.rbegin()->first == chunk->first) { + // case last available chunk is the last in the memory, we can increase that one + fIntermediateMemoryInfo.total_stack[chunk->first] = {it.tensor_name, tensor_size}; + declareIntermediateTensor(name, tensor_size, chunk->first); + fIntermediateMemoryInfo.available_stack.erase(chunk); + allocated = true; + if (fVerbose) std::cout << " is extended with a bigger one of size " << tensor_size << std::endl; + break; + } + ++chunk; + if (fVerbose) std::cout << std::endl; + } - if (!allocated) { - size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() - ? 
0 - : fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; + if (!allocated) { + size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() + ? 0 + : fIntermediateMemoryInfo.total_stack.rbegin()->first + + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; - fIntermediateMemoryInfo.total_stack[chunk_idx] = - { - it, - tensor_size - }; + fIntermediateMemoryInfo.total_stack[chunk_idx] = it; - memory_allocation_string += "\n"+ConvertTypeToString(GetTensorType(std::string(it)))+"* tensor_"+ std::string(it) + "= reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(chunk_idx) + ");\n"; - } + declareIntermediateTensor(name, tensor_size, chunk_idx); + + if (fVerbose) std::cout << "no chunk available - add in total stack a new chunk with size of tensor and idx : " << chunk_idx + << std::endl; + } } - return memory_allocation_string; + return code.str(); } -void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ - for (auto &it : op_input_tensors){ +void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ + if (fVerbose) std::cout << "*** CheckAndFlushIntermediateMemory: Loop on input tensors for op " << op_idx << "\n"; + //print available chunks + if (fVerbose) std::cout << "available chunks before freeing them : \n"; + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) { + if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl; + } + for (auto &it : op_input_tensors) { // last occurence of the tensor is reached => flush it from memory + if (fVerbose) std::cout << ".. 
input tensors : " << it; if (fIntermediateTensorFrequencyLookup[it] == op_idx) { + if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n"; for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); - chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) { - if (chunk->second.tensor_name == it) { - - // check if nearby chunks in available memory can coalesce - auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx - auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx - - // check if the next stack entry is actually adjacent in memory - if (last_smaller->first+last_smaller->second + 1 == chunk->first){ - last_smaller->second += chunk->second.tensor_size; - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); - - if (last_smaller->first + last_smaller->second + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - } else{ - if (chunk->first + chunk->second.tensor_size + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[chunk->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - fIntermediateMemoryInfo.available_stack.insert({ - chunk->first, - chunk->second.tensor_size - }); - } + chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + if (fVerbose) std::cout << "--- chunk " << chunk->first << " , " << chunk->second.tensor_name << " size " << chunk->second.tensor_size; + if (chunk->second.tensor_name == it) { + if (fVerbose) 
std::cout << " -- Found chunk corresponding to input tensor: " << chunk->first; + // check if nearby chunks in available memory can coalesce + auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound( + chunk->first); // smallest element greater than the flushed chunk idx + auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) + ? fIntermediateMemoryInfo.available_stack.end() + : std::prev(first_greater); // largest element smaller than the flushed chunk idx + + // check if the next stack entry is actually adjacent in memory + + if (last_smaller != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == chunk->first) { + // merge chunk with previous one + last_smaller->second += chunk->second.tensor_size; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); + if (fVerbose) std::cout << " is adjacent in memory with previous one - merge "; + if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == first_greater->first) { + // merge also with following one + last_smaller->second += first_greater->second; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater->first]); + // delete merged one in available stack and in total stack + fIntermediateMemoryInfo.total_stack.erase(first_greater->first); + fIntermediateMemoryInfo.available_stack.erase(first_greater); + if (fVerbose) std::cout << " merge also with following that is free "; + } + fIntermediateMemoryInfo.total_stack.erase(chunk->first); + if (fVerbose) std::cout << std::endl; + break; + } else if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + chunk->first + chunk->second.tensor_size == first_greater->first) { + // merge with first greater + if (fVerbose) std::cout << " is adjacent in memory with following one - merge \n"; + // cannot modify idx of 
first_greter. Insert a new one and delete previous one + size_t new_size = chunk->second.tensor_size + first_greater->second; + size_t first_greater_idx = first_greater->first; + fIntermediateMemoryInfo.available_stack.erase(first_greater); + // cannot use anymore first_greater + fIntermediateMemoryInfo.available_stack.insert({chunk->first, new_size}); + fIntermediateMemoryInfo.total_stack[chunk->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater_idx]); + fIntermediateMemoryInfo.total_stack.erase(first_greater_idx); + } else { + fIntermediateMemoryInfo.available_stack.insert({chunk->first, chunk->second.tensor_size}); + if (fVerbose) std::cout << " insert in the available stack the chunk with size " << chunk->second.tensor_size << std::endl; } + chunk->second.tensor_name = "free"; + break; + } } + } else { + if (fVerbose) std::cout << std::endl; } } } - - void RModel::Initialize(int batchSize, bool verbose) { std::map inputParams; if (batchSize > 0) { @@ -442,7 +549,7 @@ void RModel::Initialize(const std::map & inputParams, bool auto shape = ConvertShapeToInt(input.second.shape); if (verbose) std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from " - << ConvertDynamicShapeToString(input.second.shape) << std::endl; + << ConvertDimShapeToString(input.second.shape) << std::endl; if (!shape.empty()) { // case shape is defined (not parametric) we add the tensor in the fReadyInputTensorInfos map and // we remove the tensor from the fInputTensorInfo where th eold parametric shape was stored @@ -456,8 +563,12 @@ void RModel::Initialize(const std::map & inputParams, bool else { // store the found parametric shape parameters for (auto &d : input.second.shape) { - if (d.isParam) - fShapeParams[d.param] = std::to_string(d.dim); + if (d.isParam) { + if (fShapeParams.count(d.param) == 0) { + fDimShapeNames.push_back(d.param); + fShapeParams[d.param] = std::to_string(d.dim); + } + } } } } @@ -492,10 +603,11 @@ void 
RModel::Initialize(const std::map & inputParams, bool } fOperators[op_idx]->Initialize(*this); for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ + std::string name = std::string{it}; if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && - std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), std::string(it)) == fOutputTensorNames.end() && - fInitializedTensors.find(std::string(it)) == fInitializedTensors.end() && - fDynamicTensorInfos.find(std::string(it)) == fDynamicTensorInfos.end()){ + std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() && + fInitializedTensors.find(name) == fInitializedTensors.end() && + fDynamicTensorInfos.find(name) == fDynamicTensorInfos.end()){ fIntermediateTensorFrequencyLookup[it] = op_idx; } } @@ -534,54 +646,21 @@ void RModel::InitializeSubGraph(std::shared_ptr graph) { } -// Function to generate the code for declaring and initializing constant tensors -// This is for tensors which are not part of weight files and can be created from the Constant operator -template -std::string GenerateConstantTensorCode(const std::pair &t) -{ - std::stringstream strs; - std::string type = ConvertTypeToString(t.second.type()); - size_t length = ConvertShapeToLength(t.second.shape()); - // avoid using stack sizes for constant tensors to reduce compilation time - bool allocateOnStack = (length > 100) ? 
false : true; - - const T *data = t.second.data(); - - // and check if all values are the same - bool sameData = false; - // for non stack allocation check if data are the same - if (!allocateOnStack && length > 1) { - size_t idx = 1; - do { - sameData = (data[idx] == data[idx - 1]); - idx++; - } while (sameData && idx < length); - } - if (allocateOnStack) { - strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; - } else { - strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; - if (sameData) - strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; - else { - strs << ConvertValuesToString(length, data) << ";\n"; - } - strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; - } - return strs.str(); -} - void RModel::GenerateInitializedTensorInfo() { if (!fInitializedTensors.empty()) fGC += "// initialized tensors\n"; for (auto &i : fInitializedTensors) { + if (i.second.IsNotWritable()) continue; if (!fUseWeightFile || i.second.IsConstantTensor()) { - if (i.second.type() == ETensorType::FLOAT) + if (i.second.type() == ETensorType::FLOAT) { fGC += GenerateConstantTensorCode(i); - else if (i.second.type() == ETensorType::INT64) + fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4; + } else if (i.second.type() == ETensorType::INT64) { fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 8; + } } else { // case of tensors which are read from a file @@ -589,43 +668,55 @@ void RModel::GenerateInitializedTensorInfo() if (i.second.type() == ETensorType::FLOAT) { fGC += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fWeightsTensorSize += ConvertShapeToLength(i.second.shape()) * 4; } } } } void RModel::GenerateIntermediateMemoryPool() { - 
if (fIntermediateMemoryInfo.total_stack.size() == 0) return; + if (fIntermediateMemoryInfo.total_stack.empty()) return; fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n"; // char memory block is allocated since char takes 1 byte, thus easier to allocate tensors // of other data types - fGC += "char* fIntermediateMemoryPool = new char[" + std::to_string(fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size)+ "];\n\n"; + auto const &totalStack = fIntermediateMemoryInfo.total_stack; + const size_t memPoolSize = totalStack.rbegin()->first + totalStack.rbegin()->second.tensor_size; + fGC += "std::vector fIntermediateMemoryPool = std::vector(" + std::to_string(memPoolSize) + ");\n\n"; } void RModel::GenerateIntermediateTensorInfo() { if (!fIntermediateTensorInfos.empty()) { std::string tensor_declaration_block = ""; - for (auto &i : fIntermediateTensorInfos) { if (i.second.type == ETensorType::BOOL) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; - // No pointer allocation needed for BOOL + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; + tensor_declaration_block += "std::uint8_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + continue; } - if (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end() && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()) { + bool is_extended = (fOptimizationLevel == OptimizationLevel::kExtended); + bool not_in_freq_map = + (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end()); + bool not_in_output_names = + (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == 
fOutputTensorNames.end()); + + if ((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names)) { size_t length = ConvertShapeToLength(i.second.shape); if (i.second.type == ETensorType::FLOAT) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 4 * length; } else if (i.second.type == ETensorType::DOUBLE) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; } else if (i.second.type == ETensorType::INT64) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; } } } @@ -664,17 +755,17 @@ void RModel::GenerateOperatorDeclarations() { fGC += "\n"; } -void RModel::GenerateDynamicTensorInfo() { - fGC += "//---- allocate the intermediate dynamic tensors\n"; - std::stringstream out; - for (auto & i: fDynamicTensorInfos) { - auto length = ConvertDynamicShapeToLength(i.second.shape); - out << SP << "if (" << length << " > 0) {\n"; - out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; - out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; - out << SP << "}\n"; - } - fGC += out.str(); +void RModel::GenerateDynamicTensorInfo() +{ + std::stringstream out; + for (auto &i : fDynamicTensorInfos) { + auto length = ConvertDimShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; + out << SP << SP << "tensor_" << 
i.first << " = fTensor_" << i.first << ".data();\n"; + out << SP << "}\n"; + } + fGC += out.str(); } std::string RModel::GenerateInferSignature(bool isdecl) { @@ -702,7 +793,7 @@ std::string RModel::GenerateInferSignature(bool isdecl) { if (type == "other") throw std::runtime_error("TMVA-SOFIE: input tensor " + name + " is of a data type which is not yet supported."); - rGC += type + "* "; + rGC += type + " const* "; } rGC += "tensor_" + name + ","; i_input++; @@ -712,96 +803,73 @@ std::string RModel::GenerateInferSignature(bool isdecl) { return rGC; } -namespace { - -std::string createOutputTensor(RModel const &rmodel, std::string const &name, bool isIntermediateTensor) +void RModel::GenerateOutput() { - if(name.empty()) return "{}"; - ETensorType eOutputType = rmodel.GetTensorType(name); - std::string outputType = ConvertTypeToString(eOutputType); - if (isIntermediateTensor) { - - if (eOutputType == ETensorType::BOOL) { - return "fTensor_" + name; - } else { - // need to check is size is the same(don't want to return a vector with larger size) - // in that case better to copy - return "std::vector<" + ConvertTypeToString(eOutputType) + ">(tensor_" + name + ", tensor_" + name + " + " + - std::to_string(ConvertShapeToLength(rmodel.GetTensorShape(name))) + ")"; - } - } - // include also dynamic tensors since the vectors can be allocated with a size larger than their output - // we need a special handling for bool type allocated as vector - auto outputLength = ConvertDynamicShapeToLength(rmodel.GetDynamicTensorShape(name)); - if (rmodel.IsDynamicTensor(name) && eOutputType == ETensorType::BOOL) { - return "std::vector(fTensor_" + name + ".begin(), fTensor_" + name + ".begin() + " + outputLength + ")"; - } - return "std::vector<" + outputType + ">(tensor_" + name + ", tensor_" + name + " + " + outputLength + ")"; -} - -} // namespace - -void RModel::GenerateOutput() { - - if (fVerbose) - std::cout << "Generating main inference code for " << fName << std::endl; - 
size_t outputSize = fOutputTensorNames.size(); // assume output types are all the same - if (outputSize == 0) - throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); bool sameOutputTypes = true; std::string inferReturnType; // type return by infer function - ETensorType eOutputType = GetTensorType(*fOutputTensorNames.begin()); - std::string outputType = ConvertTypeToString(eOutputType); + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); fGC += "\n\n"; if (outputSize == 1) { - fGC += "std::vector<" + outputType + ">"; + fGC += "std::vector<" + ConvertOutputTypeToString(eFirstOutputType) + ">"; } else { // if all output types are the same we return an std::vector - otherwise a tuple - for (size_t i = 1; i < outputSize; i++) { - if (GetTensorType(fOutputTensorNames[i]) != eOutputType) + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) sameOutputTypes = false; } if (sameOutputTypes) - fGC += "std::vector>"; + fGC += "std::vector>"; else { inferReturnType = "std::tuple<"; for (size_t i = 0; i < outputSize; i++) { - inferReturnType += "std::vector<" + ConvertTypeToString(GetTensorType(fOutputTensorNames[i])) + ">"; - if (i < outputSize-1) inferReturnType += ","; + inferReturnType += "std::vector<" + ConvertOutputTypeToString(GetTensorType(fOutputTensorNames[i])) + ">"; + if (i < outputSize - 1) + inferReturnType += ","; } inferReturnType += ">"; fGC += inferReturnType; } } - fGC += " infer("; + fGC += " infer(" + GenerateInferSignature() + "){\n"; - fGC += GenerateInferSignature(); - - fGC += "){\n"; - - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - if (fVerbose) std::cout << "Generating code for operator .... 
" << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); + std::string doInferArgs = GenerateInferSignature(false); + if (!doInferArgs.empty()) + doInferArgs += ","; + for (std::string const &name : fOutputTensorNames) { + fGC += SP + "std::vector<" + ConvertOutputTypeToString(GetTensorType(name)) + " > output_tensor_" + name + ";\n"; + doInferArgs += " output_tensor_" + name + ","; } + if (!doInferArgs.empty()) + doInferArgs.back() = ' '; + + fGC += SP + "doInfer(" + doInferArgs + ");\n"; fGC += SP + "return {"; - for (size_t i = 0; i < outputSize; i++) { - std::string tensorName = *(fOutputTensorNames.begin() + i); - bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; - fGC += createOutputTensor(*this, tensorName, isIntermediate); - if (i < outputSize - 1) + for (size_t i = 0; i < fOutputTensorNames.size(); i++) { + fGC += "output_tensor_" + fOutputTensorNames[i]; + if (i < fOutputTensorNames.size() - 1) fGC += ","; } fGC += "};\n"; - fGC += "}\n"; // end of infer function scope + fGC += "}\n"; // end of infer function scope } void RModel::GenerateSessionCode() { + // Determine the signature of the actual inference function + std::string doInferSignature = GenerateInferSignature(); + if (!doInferSignature.empty()) + doInferSignature += ", "; + for (auto const &name : fOutputTensorNames) { + doInferSignature += " std::vector<" + ConvertOutputTypeToString(GetTensorType(name)) + "> &output_tensor_" + name + ","; + } + doInferSignature.back() = ' '; + + doInferSignature = "void doInfer(" + doInferSignature + ")"; // define the Session struct (for GNN this is generated in RModel_GNN) if (fUseSession && !fIsGNNComponent) { @@ -814,24 +882,31 @@ void RModel::GenerateSessionCode() // generate code for declaring the initialized tensors GenerateInitializedTensorInfo(); - // evaluate total intermediate memory and position intermediate tensor addresses - std::string intermediate_memory_alloc_string = ""; - 
intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); - CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); - } + if (fOptimizationLevel == OptimizationLevel::kExtended) { + // evaluate total intermediate memory and position intermediate tensor addresses + std::string intermediate_memory_alloc_string = ""; + intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) { + auto op = fOperators[op_idx].get(); + std::cout << "\n******************\n analyzing input/output operator " << op_idx << " " + << typeid(*op).name() << std::endl; + } + intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); + CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); + } - // to check remaining unused fragments after memory allocation (lesser the better) - // for (const auto &it: fIntermediateMemoryInfo.available_stack){ - // std::cout<<"chunk_idx: "<Generate(std::to_string(op_idx))); + } + + fGC += SP + "using SOFIE::UTILITY::FillOutput;\n\n"; + + for (std::string const &name : fOutputTensorNames) { + // need to check is size is the same (don't want to return a vector with + // larger size) in that case better to copy + bool isIntermediate = fIntermediateTensorInfos.count(name) > 0; + std::string n = isIntermediate ? 
std::to_string(ConvertShapeToLength(GetTensorShape(name))) + : ConvertDimShapeToLength(GetDimTensorShape(name)); + fGC += SP + "FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; + } + + fGC += "}\n\n"; + + // generate the inference overload that returns an output struct GenerateOutput(); // end of session if (fUseSession && !fIsGNNComponent) { - fGC += "}; // end of Session\n"; + fGC += "}; // end of Session\n\n"; } } @@ -982,8 +1087,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { fGC += " f.seekg(" + std::to_string(pos) + ");\n"; } - fGC += " std::string tensor_name;\n"; - fGC += " size_t length;\n"; + fGC += " using SOFIE::ReadTensorFromStream;\n"; // loop on tensors and parse the file for (auto& i: fInitializedTensors) { @@ -991,25 +1095,8 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { if (!i.second.IsWeightTensor()) continue; std::string tensor_name = "tensor_" + i.first; if (i.second.type() == ETensorType::FLOAT) { - size_t length = 1; - length = ConvertShapeToLength(i.second.shape()); - std::string slength = std::to_string(length); - fGC += " f >> tensor_name >> length;\n"; - fGC += " if (tensor_name != \"" + tensor_name + "\" ) {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor name; expected name is " + - tensor_name + " , read \" + tensor_name;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " if (length != " + slength + ") {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor size; expected size is " + - slength + " , read \" + std::to_string(length) ;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " for (size_t i = 0; i < length; ++i)\n"; - fGC += " f >> " + tensor_name + "[i];\n"; - fGC += " if (f.fail()) {\n"; - fGC += " throw std::runtime_error(\"TMVA-SOFIE failed to read the values for tensor " + tensor_name + "\");\n"; - fGC += " }\n"; + std::string length = 
std::to_string(ConvertShapeToLength(i.second.shape())); + fGC += " ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n"; } else { std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); } @@ -1019,6 +1106,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { // generate the code to read initialized tensors from a ROOT data file if(fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY fGC += " {\n"; fGC += " std::unique_ptr rootFile(TFile::Open(filename.c_str(), \"READ\"));\n"; fGC += " if (!rootFile->IsOpen()) {\n"; @@ -1050,6 +1138,9 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { fGC += " }\n"; } fGC += " }\n"; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY } } @@ -1075,6 +1166,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { // Write the initialized tensors to the file if (fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY if(fIsGNNComponent || fIsGNN) { throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file."); } @@ -1118,6 +1210,9 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { // this needs to be changed, similar to the text file return -1; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY } else if (fWeightFile == WeightFileType::Text) { std::ofstream f; if(fIsGNNComponent) { @@ -1244,9 +1339,9 @@ void RModel::PrintOutputTensors() { for (auto& it: fOutputTensorNames) { std::cout << "Tensor name: \"" << it << "\"\t"; if (!IsDynamicTensor(it)) - std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl; - else - std::cout << "shape: " << ConvertDynamicShapeToString(GetDynamicTensorShape(it)) << std::endl; + std::cout << "shape: " << 
ConvertShapeToString(GetTensorShape(it)) << std::endl; + else + std::cout << "shape: " << ConvertDimShapeToString(GetDimTensorShape(it)) << std::endl; } std::cout << "\n"; } @@ -1312,13 +1407,13 @@ void RModel::OutputGenerated(std::string filename, bool append) { void RModel::Streamer(TBuffer &R__b) { if (R__b.IsReading()) { RModel::Class()->ReadBuffer(R__b, this); - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastPersistentToShared(); + for (auto & i : fInitializedTensors) { + i.second.CastPersistentToShared(); } } else { - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastSharedToPersistent(); + for (auto & i : fInitializedTensors) { + i.second.CastSharedToPersistent(); } RModel::Class()->WriteBuffer(R__b, this); } diff --git a/src/SOFIE_core/src/RModel_ALPAKA.cxx b/src/SOFIE_core/src/RModel_ALPAKA.cxx new file mode 100644 index 0000000..f1945b7 --- /dev/null +++ b/src/SOFIE_core/src/RModel_ALPAKA.cxx @@ -0,0 +1,447 @@ +#include +#include +#include +#include +#include + +#include "TFile.h" +#include "SOFIE/RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" + +namespace SOFIE { + +void RModel::GenerateInitializedTensorInfo_GPU_ALPAKA() { + if (!fInitializedTensors.empty()){ + fGC += "\n// initialized tensors for weights\n"; + } + + for (auto &i : fInitializedTensors) { + if (!fUseWeightFile || i.second.IsConstantTensor()) { + if (i.second.type() == ETensorType::FLOAT) + fGC += GenerateConstantTensorCode(i); + else if (i.second.type() == ETensorType::INT64) + fGC += GenerateConstantTensorCode(i); + + } + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type() == ETensorType::INT64) { + fGC += "BufI641D 
deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + + } +} + +void RModel::GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA() +{ + if (!fInitializedTensors.empty()) + fGC += "// temporary initialized tensors for loading weights\n"; + + for (auto &i : fInitializedTensors) { + if (fUseWeightFile && !i.second.IsConstantTensor()) { + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "std::vector tensor_" + i.first + "(" + std::to_string(length) + ");\n"; + } + } + } +} + +void RModel::GenerateGPU_ALPAKA_Buffers() { + if (!fIntermediateTensorInfos.empty()) { + std::string tensor_declaration_block = ""; + + for (auto &i : fIntermediateTensorInfos) { + if (i.second.type == ETensorType::BOOL) { + tensor_declaration_block += "std::vector fTensor_" + i.first + + " = std::vector(" + + std::to_string(ConvertShapeToLength(i.second.shape)) + + ");\n"; + // No pointer allocation needed for BOOL + } + + size_t length = ConvertShapeToLength(i.second.shape); + + if (i.second.type == ETensorType::FLOAT) { + tensor_declaration_block += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + tensor_declaration_block += "BufD1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::INT64) { + tensor_declaration_block += "BufI641D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + } + + if (tensor_declaration_block.length()) { + fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; + } + } + + // add also the dynamic tensors (only declarations, allocation will be done later) + if 
(!fDynamicTensorInfos.empty()) { + fGC += "//--- declare the dynamic tensors\n"; + fGC += "using bufDev_float = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_double = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_int64 = alpaka::Buf, size_t>;\n"; + + for (auto &i : fDynamicTensorInfos) { + if (i.second.type == ETensorType::FLOAT) { + fGC += "bufDev_float bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + fGC += "bufDev_double bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::INT64) { + fGC += "bufDev_int64 bufDev_" + i.first + ";\n"; + } + } + } +} + +void RModel::GenerateDynamicTensorInfo_GPU_ALPAKA() { + fGC += "//---- allocate the intermediate dynamic tensors\n"; + std::stringstream out; + + for (auto &i : fDynamicTensorInfos) { + auto length = ConvertDimShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << "auto bufDev_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" << length << "}));\n"; + out << SP << "}\n"; + } + fGC += out.str(); +} + +std::string RModel::GenerateInferSignature_GPU_ALPAKA(bool isdecl) { + // generate the infer signature given the inputs: eg. "BufF1D const deviceBuf_A, BufF1D const deviceBuf_B" + // if (isdecl = false) generate only calling signature (deviceBuf_A, deviceBuf_B, ....) 
+ + auto GetBufType = [this](const std::string& name) -> std::string { + ETensorType type = GetTensorType(name); + if (type == ETensorType::FLOAT) return "BufF1D"; + if (type == ETensorType::DOUBLE) return "BufD1D"; + if (type == ETensorType::INT64) return "BufI641D"; + throw std::runtime_error("TMVA-SOFIE: input tensor " + name + + " is of a data type which is not yet supported."); + }; + + std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + // if is a dynamic tensor pass initial parameters + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + rGC += GetBufType(name) + " const "; + } + rGC += "deviceBuf_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back(); // remove last "," + return rGC; +} + +void RModel::GenerateOutput_GPU_ALPAKA() { + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; + + size_t outputSize = fOutputTensorNames.size(); + if (outputSize == 0) + throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); + + bool sameOutputTypes = true; + std::string inferReturnType; + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); + + fGC += "\n\n"; + if (outputSize == 1) { + fGC += "alpaka::Buf"; + } else { + // if all output types are the same we return an std::vector - otherwise a tuple + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) + sameOutputTypes = false; + } + if (sameOutputTypes) + fGC += "std::array, " + std::to_string(outputSize) + ">"; + else { + inferReturnType = "std::tuple<"; + for (size_t i = 0; i < outputSize; i++) { + inferReturnType += "alpaka::Buf"; + if (i < outputSize - 
1) + inferReturnType += ","; + } + inferReturnType += ">"; + fGC += inferReturnType; + } + } + + fGC += " infer("; + fGC += GenerateInferSignature_GPU_ALPAKA(); + fGC += "){\n"; + + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) + std::cout << "Generating code for operator .... " << op_idx << std::endl; + fGC += (fOperators[op_idx]->Generate_GPU_ALPAKA(std::to_string(op_idx))); + } + + // fGC += "\n\n alpaka::wait(queue);\n"; + fGC += SP + "return "; + if (outputSize>1) fGC += " {"; + for (size_t i = 0; i < outputSize; i++) { + std::string tensorName = *(fOutputTensorNames.begin() + i); + bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; + fGC += "deviceBuf_"+tensorName; + if (i < outputSize - 1) + fGC += ","; + } + if (outputSize>1) fGC += " };\n"; + else fGC += ";\n"; + fGC += "}\n"; // end of infer function scope +} + +void RModel::GenerateSessionCode_GPU_ALPAKA() { + + std::set registered_operators; + std::set single_initialized_operators = { + SOFIE::OperatorKind::RELU, + SOFIE::OperatorKind::SIGMOID, + SOFIE::OperatorKind::TANH, + SOFIE::OperatorKind::SOFTMAX, + SOFIE::OperatorKind::LEAKYRELU, + SOFIE::OperatorKind::EINSUM, + SOFIE::OperatorKind::COMPARISON, + SOFIE::OperatorKind::ELU, + }; + bool OpNeedsBlas = false; + + // single initiation operators must only be initialized only once and their count should be stored in the registered_operators set to avoid generating multiple kernels for the same operator kind + fGC += "\n//--- ALPAKA Kernels\n"; + for (size_t id = 0; id < fOperators.size(); id++) { + if(fOperators[id]->GetKind() == OperatorKind::GEMM){ + OpNeedsBlas = true; + } + if(single_initialized_operators.find(fOperators[id]->GetKind()) != single_initialized_operators.end()) { + + if(registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + + if (fVerbose) + std::cout<<"Generating ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + + fGC 
+= fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } else { + if (fVerbose) + std::cout<<"Generating ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + } + } + + // define the Session struct (for GNN this is generated in RModel_GNN) + fGC += "\n\ntemplate \n"; + if (fUseSession) { + if (!fIsSubGraph) + fGC += "struct Session {\n\n"; + else + fGC += "struct Session_" + fName + " {\n\n"; + } + + // define host and device accelerators + fGC += "using Idx = std::size_t;\n"; + fGC += "using Dim = alpaka::DimInt<1>;\n"; + fGC += "using Acc = alpaka::TagToAcc;\n"; + fGC += "using DevAcc = alpaka::Dev;\n\n"; + fGC += "using QueueProperty = alpaka::NonBlocking;\n"; + fGC += "using QueueAcc = alpaka::Queue;\n\n"; + fGC += "using BufF1D = alpaka::Buf;\n"; + fGC += "using BufD1D = alpaka::Buf;\n"; + fGC += "using BufI641D = alpaka::Buf;\n\n"; + + fGC += "\nalpaka::Platform const platform{};\n"; + fGC += "DevAcc devAcc = alpaka::getDevByIdx(platform, 0);\n"; + fGC += "alpaka::PlatformCpu platformHost{};\n"; + fGC += "alpaka::DevCpu hostAcc = alpaka::getDevByIdx(platformHost, 0);\n"; + fGC += "QueueAcc queue{devAcc};\n"; + fGC += "Idx threadsPerBlock = 256;\n"; + fGC += "\nusing Ext1D = alpaka::Vec;\n"; + fGC += "using Vec = alpaka::Vec;\n"; + if (OpNeedsBlas) { + fGC += "\n\n// BLAS declarations\n"; + fGC += "sofieBLAS blas{queue};\n"; + } + + GenerateInitializedTensorInfo_GPU_ALPAKA(); + GenerateGPU_ALPAKA_Buffers(); + GenerateOperatorDeclarations(); + + // add subgraph session + if (!fSubGraphs.empty()) + fGC += "// subgraph sessions\n"; + for (auto &graph : fSubGraphs) { + fGC += "Session_" + graph->fName + " fSession_" + graph->fName + ";\n"; + } + + // Session constructor + if (fUseSession) { + std::string sessionName = "\n\nSession"; + if (fIsSubGraph) + sessionName += "_" + fName; + + if 
(fUseWeightFile) { + std::string fileName = fName; + if (fWeightFile == WeightFileType::Text) + fileName += ".dat"; + if (fWeightFile == WeightFileType::RootBinary) + fileName += ".root"; + + fGC += sessionName + "(std::string filename =\"" + fileName + "\""; + } else { + fGC += sessionName + "(std::string = \"\""; + } + + if (!fShapeParams.empty()) { + for (auto &p : fShapeParams) { + fGC += ",\n"; + fGC += " size_t " + p.first + " = " + p.second; + } + } + fGC += ") {\n"; + + GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + if (fUseWeightFile) { + fGC += "\n//--- reading weights from file\n"; + ReadInitializedTensorsFromFile(0); + fGC += "\n"; + } + + MoveInitializedTensorsToBuffers_ALPAKA(); + GenerateDynamicTensorInfo_GPU_ALPAKA(); + + for (size_t id = 0; id < fOperators.size(); id++) { + fGC += fOperators[id]->GenerateInitCode_GPU_ALPAKA(); + if (fOperators[id]->GetKind() == OperatorKind::GEMM){ + fGC += "\nblas.AddLayoutConfig("+fOperators[id]->GetBlasConfig()+");\n"; + } + } + + fGC += "\nalpaka::wait(queue);\n"; + fGC += "}\n\n"; + } + + registered_operators.clear(); + + for (size_t id = 0; id < fOperators.size(); id++) { + + if(single_initialized_operators.find(fOperators[id]->GetKind()) != single_initialized_operators.end()) { + + if(registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + + if (fVerbose) + std::cout<<"Declaring ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + + fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } else { + if (fVerbose) + std::cout<<"Declaring ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + } + } + + GenerateOutput_GPU_ALPAKA(); + + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n"; + } +} + +void 
RModel::GenerateGPU_ALPAKA(std::underlying_type_t<Options> options, int batchSize, bool verbose) {
+   fVerbose = verbose;
+   fBatchSize = batchSize;
+
+   if (static_cast<std::underlying_type_t<Options>>(Options::kNoSession) & options) {
+      fUseSession = false;
+      fWeightFile = WeightFileType::None;
+   }
+   if (static_cast<std::underlying_type_t<Options>>(Options::kNoWeightFile) & options) {
+      fUseWeightFile = false;
+      fWeightFile = WeightFileType::None;
+   }
+   if (static_cast<std::underlying_type_t<Options>>(Options::kRootBinaryWeightFile) & options) {
+      fUseWeightFile = true;
+      fWeightFile = WeightFileType::RootBinary;
+   }
+   if (fUseWeightFile && !fUseSession) {
+      throw std::runtime_error(
+         "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class");
+   }
+
+   if (static_cast<std::underlying_type_t<Options>>(Options::kGNN) & options ||
+       static_cast<std::underlying_type_t<Options>>(Options::kGNNComponent) & options)
+      throw std::runtime_error("SOFIE GPU does not yet support GNN Inference.");
+
+   Initialize(batchSize, verbose);
+
+   std::string hgname;
+   if (!fIsSubGraph) {
+      fGC.clear();
+      GenerateHeaderInfo_GPU_ALPAKA(hgname);
+   }
+
+   if (fVerbose)
+      std::cout << "generate Main session code - model " << fName << std::endl;
+
+   GenerateSessionCode_GPU_ALPAKA();
+
+   if (!fIsSubGraph) {
+      fGC += ("} //SOFIE_" + fName + "\n");
+      fGC += "\n#endif // " + hgname + "\n";
+   }
+}
+
+void RModel::MoveInitializedTensorsToBuffers_ALPAKA(){
+   for (auto &i : fInitializedTensors) {
+      if (i.second.IsNotWritable()) continue;
+      std::string tensor_name = "tensor_" + i.first;
+      auto length = ConvertShapeToLength(i.second.shape());
+      std::string slength = std::to_string(length);
+      if (i.second.type() == ETensorType::FLOAT) {
+         fGC += " auto hostBuf_"+i.first+" = alpaka::createView(hostAcc, tensor_"+i.first+");\n";
+         fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n";
+      } else if (i.second.type() == ETensorType::DOUBLE) {
+         fGC += " auto hostBuf_"+i.first+" = alpaka::createView(hostAcc, tensor_"+i.first+");\n";
+         fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n";
+      
} else if (i.second.type() == ETensorType::INT64) {
+         fGC += " auto hostBuf_"+i.first+" = alpaka::createView(hostAcc, tensor_"+i.first+", " + slength + ");\n";
+         fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n";
+      } else {
+         throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a ROOT file");
+      }
+   }
+}
+
+} // namespace SOFIE
diff --git a/src/SOFIE_core/src/RModel_Base.cxx b/src/SOFIE_core/src/RModel_Base.cxx
index d4d1f1c..f212c53 100644
--- a/src/SOFIE_core/src/RModel_Base.cxx
+++ b/src/SOFIE_core/src/RModel_Base.cxx
@@ -58,6 +58,38 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) {
    }
 }
 
+void RModel_Base::GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname) {
+   fGC += ("//Code generated automatically by TMVA for GPU Inference using ALPAKA of Model file [" + fFileName + "] at [" + fParseTime.substr(0, fParseTime.length()-1) +"] \n");
+   // add header guards
+   hgname = fName;
+   std::transform(hgname.begin(), hgname.end(), hgname.begin(), [](unsigned char c) {
+      return std::toupper(c);
+   } );
+   hgname = "SOFIE_" + hgname;
+   fGC += "\n#ifndef " + hgname + "\n";
+   fGC += "#define " + hgname + "\n\n";
+   for (auto& i: fNeededStdLib) {
+      fGC += "#include <" + i + ">\n";
+   }
+   for (auto& i: fCustomOpHeaders) {
+      fGC += "#include \"" + i + "\"\n";
+   }
+   fGC += "#include \n";
+   fGC += "#include \n";
+
+   // for the session we need to include SOFIE_Common functions
+   //needed for convolution operator (need to add a flag)
+   fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n";
+   if (fUseWeightFile)
+      fGC += "#include \n";
+   // Include TFile when saving the weights in a binary ROOT file
+   if (fWeightFile == WeightFileType::RootBinary)
+      fGC += "#include \"TFile.h\"\n";
+
+   fGC += "\nusing Dim1D = alpaka::DimInt<1>;\n";
+
+   fGC += "\nnamespace SOFIE_" + fName + "{\n";
+}
+
 void RModel_Base::OutputGenerated(std::string filename, bool append) {
 // the model can be
appended only if a file name is provided if (filename.empty()) { diff --git a/src/SOFIE_core/src/RModel_GNN.cxx b/src/SOFIE_core/src/RModel_GNN.cxx index a1dfe06..3dae254 100644 --- a/src/SOFIE_core/src/RModel_GNN.cxx +++ b/src/SOFIE_core/src/RModel_GNN.cxx @@ -94,7 +94,7 @@ void RModel_GNN::Generate() { // the number of output edges features can be smaller, so we need to correct here auto num_edge_features_input = num_edge_features; - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -117,7 +117,7 @@ void RModel_GNN::Generate() { // we need to correct the output number of node features auto num_node_features_input = num_node_features; - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/RModel_GraphIndependent.cxx b/src/SOFIE_core/src/RModel_GraphIndependent.cxx index bab06b3..cd62d0c 100644 --- a/src/SOFIE_core/src/RModel_GraphIndependent.cxx +++ b/src/SOFIE_core/src/RModel_GraphIndependent.cxx @@ -81,7 +81,7 @@ void RModel_GraphIndependent::Generate() { // the number of output edges features can be smaller, so we need to correct here // assume num_edge_features is not a 
parametric shape - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -100,7 +100,7 @@ void RModel_GraphIndependent::Generate() { fGC+="};\n}\n"; // we need to correct the output number of node features - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } @@ -119,7 +119,7 @@ void RModel_GraphIndependent::Generate() { // we need to correct the output number of global features // global features are in shape[1] #if 0 - auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDynamicTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDimTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!globals_update_output_shape[1].isParam && globals_update_output_shape[1].dim != num_global_features_input) { num_global_features = globals_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/SOFIE_common.cxx b/src/SOFIE_core/src/SOFIE_common.cxx index ad74313..cd1b60a 100644 --- a/src/SOFIE_core/src/SOFIE_common.cxx +++ 
b/src/SOFIE_core/src/SOFIE_common.cxx @@ -1,15 +1,18 @@ #include "SOFIE/SOFIE_common.hxx" -#include + +#include #include #include +#include +#include +#include - -namespace SOFIE{ +namespace SOFIE { /// @brief Convert shape from integer format to dynamic one (based on Dim) /// @param shape /// @return shape based on Dim -std::vector ConvertShapeToDim(std::vector shape){ +std::vector ConvertShapeToDim(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ ret_shape[i].dim = shape[i]; @@ -20,7 +23,7 @@ std::vector ConvertShapeToDim(std::vector shape){ /// @brief Convert shape based on Dim to integer format /// @param shape /// @return shape based on integer. Return an empty shape in case shape is dynamic (has a parameter) -std::vector ConvertShapeToInt(std::vector shape){ +std::vector ConvertShapeToInt(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ if (shape[i].isParam) { @@ -46,7 +49,7 @@ std::vector ConvertShapeToInt(std::vector shape){ } -std::size_t ConvertShapeToLength(std::vector shape){ +std::size_t ConvertShapeToLength(const std::vector & shape){ // Empty shape represent scalar values, so we return a length=1 std::size_t fLength = 1; for (auto& dim: shape) fLength *= dim; @@ -58,6 +61,9 @@ std::string ConvertTypeToString(ETensorType type){ case ETensorType::FLOAT : { return "float"; } + case ETensorType::INT8 : { + return "int8_t"; + } case ETensorType::INT16 : { return "int16_t"; } @@ -67,6 +73,9 @@ std::string ConvertTypeToString(ETensorType type){ case ETensorType::INT64 : { return "int64_t"; } + case ETensorType::UINT8 : { + return "uint8_t"; + } case ETensorType::UINT16 : { return "uint16_t"; } @@ -80,7 +89,7 @@ std::string ConvertTypeToString(ETensorType type){ return "double"; } case ETensorType::BOOL : { - return "bool"; + return "uint8_t"; } default:{ return "other_" + std::to_string( (int) type); @@ -106,7 +115,7 @@ ETensorType 
ConvertStringToType(std::string type){ } } -std::string ConvertShapeToString(std::vector shape) { +std::string ConvertShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { @@ -117,41 +126,49 @@ std::string ConvertShapeToString(std::vector shape) { return out.str(); } -std::string ConvertDynamicShapeToString(std::vector shape) { +std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { - out << shape[i].GetVal(); + out << shape[i]; if (i < shape.size()-1) out << " , "; } out << " }"; return out.str(); } -std::string ConvertDynamicShapeToLength(std::vector shape) { +std::string ConvertDimShapeToLength(const std::vector & shape) { // convert generic shape to a string // multiply all the integer specified dimensions of the shape std::string length; - size_t int_length = 0; + // case of empty vectors return 1 + if (shape.empty()) return "1"; + int64_t int_length = -1; for (size_t i = 0; i < shape.size(); i++) { if (shape[i].isParam) { if (!length.empty()) length += " * "; length += shape[i].param; } else { - if (int_length == 0) + if (int_length == -1) int_length = shape[i].dim; else int_length *= shape[i].dim; } } // multiply the integer components to the parametric one - if (int_length > 0) { - if (!length.empty()) length += " * "; - length += std::to_string(int_length); + // if larger than 1 - otherwise returns -1 + if (int_length >= 0) { + if (!length.empty() && int_length > 1) { + length += " * "; + length += std::to_string(int_length); + } else if (length.empty()) { // case is full known shape + length = std::to_string(int_length); + } } return length; } + namespace{ template static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* input, T* target){ //only visible within this translation unit @@ -169,6 +186,12 @@ static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* 
inp } } +bool IsInteger(const std::string & s) { + int value; + auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), value); + return ec == std::errc() && ptr == s.data() + s.size(); +} + bool UTILITY::AreSameShape(const std::vector& shapeA, const std::vector& shapeB) { if (shapeA.size() != shapeB.size()) { return false; @@ -330,17 +353,24 @@ std::vector UTILITY::MultidirectionalBroadcastShape(std::vector UTILITY::UnidirectionalBroadcastShape(std::vector shapeA, std::vector shapeB) +// check multi-directional broadcasting of two shapes (need to pass inputs by non const ref. since we might prepends with one's +// return a pair of integer flag and new broadcasted shape +// if flag = 0: shape are identical +// flag = 1: return shape is equal to A, we broadcast B +// flag = 2: return shape is equal to B we broadcast A +// flag = 3: return shape is common of two we broadcast A and B to output +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { size_t sizeA = shapeA.size(); size_t sizeB = shapeB.size(); // Check if A and B have the same shape if (UTILITY::AreSameShape(shapeA, shapeB)){ - return shapeA; + return std::make_pair(0, shapeA); } // Find the common shape of A and B size_t size = std::max(sizeA, sizeB); if (sizeA < size) { + // prepend 1's in A to make of same shape as B std::vector newShapeA(size, 1); size_t offset = size - sizeA; std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); @@ -359,36 +389,117 @@ std::vector UTILITY::UnidirectionalBroadcastShape(std::vector s break; } } + int broadcastFlag = 0; if (broadcastable) { // The output shape is max(outShape, targetShape) std::vector targetShape(size, 1); for (size_t i = 0; i < size; i++) { targetShape[i] = std::max(shapeA[i], shapeB[i]); + if (shapeB[i] < targetShape[i]) broadcastFlag |= 1; + if (shapeA[i] < targetShape[i]) broadcastFlag |= 2; } - return targetShape; + return std::make_pair(broadcastFlag, targetShape); } else { throw - 
std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) + " to a common shape."); } } +// unidirectional broadcast- of shape A to target B +std::vector UTILITY::UnidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) +{ + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeB, shapeA); + if (ret.first > 1) { + throw + std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + + ConvertShapeToString(shapeA) + " to " + ConvertShapeToString(shapeB) + + " in a common shape."); + } + return ret.second; +} + +// for broadcasting Dim shapes +// flag indicates also which vector needs to be broadcasted +// flag & 1 == 1 : broadcast B -> A +// flag & 2 == 2 : broadcast A -> B +// flag & 4 == 4 a run time check is needed on shapes with values +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { + size_t sizeA = shapeA.size(); + size_t sizeB = shapeB.size(); + // Check if A and B have the same shape + if (UTILITY::AreSameShape(shapeA, shapeB)){ + return std::make_pair(0, shapeA); + } + // Find the common shape of A and B + size_t size = std::max(sizeA, sizeB); + if (sizeA < size) { + // prepend 1's in A to make of same shape as B + std::vector newShapeA(size, Dim{1}); + size_t offset = size - sizeA; + std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); + shapeA = std::move(newShapeA); + } + if (sizeB < size) { + std::vector newShapeB(size, Dim{1}); + size_t offset = size - sizeB; + std::copy(shapeB.begin(), shapeB.end(), newShapeB.begin() + offset); + shapeB = std::move(newShapeB); + } -// UNidirectional boradcast specializaiton for vector - -// specialization for vector of boolean -void UTILITY::UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const 
std::vector& targetShape, std::vector & broadcastedData) - { - // Prepend shape with ones - auto ncdata = const_cast &>(data); - if (shape.size() < targetShape.size()) { - size_t targetSize = targetShape.size(); - std::vector newShape(targetSize, 1); - size_t offset = targetSize - shape.size(); - std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - UTILITY::BroadcastTensor &, std::vector &>(ncdata, newShape, targetShape, broadcastedData); - } - UTILITY::BroadcastTensor &, std::vector &>(ncdata, shape, targetShape, broadcastedData); + int broadcastFlag = 0; + // The output shape is targetShape + std::vector targetShape(size); + for (size_t i = 0; i < size; i++) { + // assume we broadcast to the parametric value + if (shapeA[i] == shapeB[i]) { + targetShape[i] = shapeA[i]; + } else if (shapeA[i].isParam && shapeB[i].GetVal() == "1" ) { + // broadcast B to A (case A is parametric with ) + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].GetVal() == "1" && shapeB[i].isParam) { + // broadcast A to B + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else if (!shapeA[i].isParam && !shapeB[i].isParam) { + if (shapeB[i].dim == 1) { + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].dim == 1) { + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else { + // non broadcastable case cannot have A and B two different defined shapes different than one + broadcastFlag = -1; + } + } else if (shapeA[i].isParam && shapeB[i].isParam) { + // full dynamic case - we will decided at run time + std::stringstream s; + s << "std::max(" << shapeA[i] << "," << shapeB[i] << ")"; + // use -1 for dim to indicate is an expression + targetShape[i] = Dim { s.str() , static_cast(-1)}; + broadcastFlag |= 4; + } else if (shapeA[i].isParam && !shapeB[i].isParam) { + // A -> B need to check at run time if consistent + targetShape[i] = shapeB[i]; + broadcastFlag |= 6; + } else if (!shapeA[i].isParam && shapeB[i].isParam) { + // B -> 
A need to check at run time if consistent + targetShape[i] = shapeA[i]; + broadcastFlag |= 5; + } else { + // all cases should be covered + throw std::runtime_error("TMVA::SOFIE - Fatal error in MultiDirectionalBroadCastDimShape"); + } + } + if (broadcastFlag == -1) { + throw std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + + ConvertDimShapeToString(shapeA) + " and " + ConvertDimShapeToString(shapeB) + + " to a common shape."); + } + + return std::make_pair(broadcastFlag, targetShape); } std::string UTILITY::Clean_name(std::string input_tensor_name){ @@ -413,15 +524,146 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) // assume row major layout const auto size = shape.size(); std::vector strides(size); - strides[size-1] = Dim{1}; - for (std::size_t i = 1; i < size; i++) { - if (!shape[size-i].isParam && !strides[size-i].isParam) - strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; - else - strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + if (size > 0) { + strides[size-1] = Dim{1}; + for (std::size_t i = 1; i < size; i++) { + if (!shape[size-i].isParam && !strides[size-i].isParam) + strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; + else { + if (strides[size-i].GetVal() == "1") + strides[size - 1 - i] = shape[size-i]; + else if (shape[size-i].GetVal() == "1") + strides[size - 1 - i] = strides[size-i]; + else + strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + } + } } return strides; } +struct FreeBlock { + std::size_t offset; + std::size_t size; + bool operator<(const FreeBlock& other) const { + // order by offset for deterministic coalescing + return offset < other.offset; + } +}; + +struct MemoryEvent { + int t; // time (i.e. 
operator index) + int type; // 0 = END first, 1 = START + int idx; // tensor index + bool operator<(const MemoryEvent& o) const { + if (t != o.t) return t < o.t; + return type < o.type; // END before START at the same time + } +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ) +{ + // Basic validation + for (const auto &t : tensorsInfo) { + if (!(t.end > t.begin)) { + throw std::runtime_error("Each tensor must have end > begin."); + } + } + + // Build events: free before allocate at equal times. + std::vector events; + events.reserve(tensorsInfo.size() * 2); + for (int i = 0; i < (int)tensorsInfo.size(); ++i) { + events.push_back({tensorsInfo[i].end, 0, i}); // END + events.push_back({tensorsInfo[i].begin, 1, i}); // START + } + std::sort(events.begin(), events.end()); + + std::vector tensorsOffset(tensorsInfo.size()); + + // Free list ordered by offset (for O(log n) coalescing) + // and faster insert/erase with respect to a vector + std::set free_list; + + // Bookkeeping: size/offset map for frees. + std::unordered_map live_size; + std::unordered_map live_offset; + + std::size_t total_bytes = 0; + + auto allocate_best_fit = [&](std::size_t need) -> std::size_t { + // Find the *smallest* block whose size >= need (best-fit). + // Since free_list is ordered by offset, we scan to find best by size. + // (For very large sets you could maintain a multimap by size as well.) + auto best = free_list.end(); + for (auto it = free_list.begin(); it != free_list.end(); ++it) { + if (it->size >= need) { + if (best == free_list.end() || it->size < best->size) + best = it; + } + } + if (best != free_list.end()) { + std::size_t off = best->offset; + if (best->size == need) { + free_list.erase(best); + } else { + FreeBlock updated{best->offset + need, best->size - need}; + free_list.erase(best); + free_list.insert(updated); + } + return off; + } + // No free block large enough; grow the heap. 
+ std::size_t off = total_bytes; + total_bytes += need; + return off; + }; + + auto try_coalesce = [&](std::set::iterator it) { + // Coalesce with previous + if (it != free_list.begin()) { + auto prev = std::prev(it); + if (prev->offset + prev->size == it->offset) { + FreeBlock merged{prev->offset, prev->size + it->size}; + free_list.erase(prev); + it = free_list.erase(it); + it = free_list.insert(merged).first; + } + } + // Coalesce with next + auto next = std::next(it); + if (next != free_list.end() && it->offset + it->size == next->offset) { + FreeBlock merged{it->offset, it->size + next->size}; + free_list.erase(next); + it = free_list.erase(it); + free_list.insert(merged); + } + }; + + // Sweep through time. + for (const auto &e : events) { + if (e.type == 0) { // END: free + auto it_sz = live_size.find(e.idx); + auto it_off = live_offset.find(e.idx); + if (it_sz != live_size.end() && it_off != live_offset.end()) { + FreeBlock fb{it_off->second, it_sz->second}; + // Insert and coalesce with neighbors + auto it = free_list.insert(fb).first; + try_coalesce(it); + live_size.erase(it_sz); + live_offset.erase(it_off); + } + } else { // START: allocate + auto &t = tensorsInfo[e.idx]; + std::size_t off = allocate_best_fit(t.size); + tensorsOffset[e.idx] = off; + live_size[e.idx] = t.size; + live_offset[e.idx] = off; + } + } + + return MemoryResult{total_bytes, std::move(tensorsOffset)}; +} -}//SOFIE +} // namespace SOFIE \ No newline at end of file diff --git a/src/SOFIE_core/test/CMakeLists.txt b/src/SOFIE_core/test/CMakeLists.txt index 34bb49f..fd848df 100644 --- a/src/SOFIE_core/test/CMakeLists.txt +++ b/src/SOFIE_core/test/CMakeLists.txt @@ -1,131 +1,191 @@ -# Copyright (C) 1995-2021, Rene Brun and Fons Rademakers. -# All rights reserved. -# -# For the licensing terms see $ROOTSYS/LICENSE. -# For the list of contributors see $ROOTSYS/README/CREDITS. 
+cmake_minimum_required(VERSION 3.14)
+include(FetchContent)
 ############################################################################
-# CMakeLists.txt file for building TMVA SOFIE tests.
-# @author Federico Sossai, Sanjiban Sengupta
+# Basic setup
 ############################################################################
-
 include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_core/inc)
 include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_parsers/inc)
 
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
 if (NOT ONNX_MODELS_DIR)
   set(ONNX_MODELS_DIR input_models)
 endif()
 
-# Finding .onnx files to be parsed and creating the appropriate code to
-# parse all file. It is much faster to combine all parsing in a single executable
-# which will avoid initialization time (especially when using ROOT)
-set(CAPTURE_STR "EmitModel( \"@1\", \"@2\");")
-set(ALL_CAPTURES "")
-# Finding .onnx files to be parsed and creating the appropriate command
+option(ENABLE_ALPAKA_TESTS "Enable Alpaka-based SOFIE tests" OFF)
+
+set(ALPAKA_BACKEND "cuda"
+    CACHE STRING "Alpaka backend to test (cuda, cpu, hip, sycl)")
+set_property(CACHE ALPAKA_BACKEND PROPERTY STRINGS cuda cpu hip sycl)
+
+############################################################################
+# Generate emitter sources
+############################################################################
+set(CAPTURE_STR
+"try {\n\
+  EmitModel(\"@1\", \"@2\");\n\
+} catch (const std::exception& e) {\n\
+  std::string msg = e.what();\n\
+  if (msg.find(\"multiple output tensors are not supported\") != std::string::npos) {\n\
+    std::cerr << \"[SKIP] Multiple outputs are not supported for @1\" << std::endl;\n\
+  } else if (msg.find(\"is of a data type which is not yet supported\") != std::string::npos) {\n\
+    std::cerr << \"[SKIP] Operator with unsupported data type in @1: \" << msg << std::endl;\n\
+  } else {\n\
+    std::cerr << \"[ERROR] Failed processing @1: \" << msg << std::endl;\n\
+    failures++;\n\
+  }\n\
+} catch 
(...) {\n\
+  std::cerr << \"[ERROR] Unknown failure processing @1\" << std::endl;\n\
+  failures++;\n\
+}\n\
+")
+
 file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx")
+
+set(ALL_CAPTURES "")
 foreach(onnx_file ${ONNX_FILES})
   get_filename_component(fname ${onnx_file} NAME_WE)
-  get_filename_component(fdir ${onnx_file} DIRECTORY)
-  string(REPLACE "@1" ${onnx_file} cap ${CAPTURE_STR})
-  string(REPLACE "@2" ${fname} cap ${cap})
-  list(APPEND ALL_CAPTURES ${cap})
+  string(REPLACE "@1" "${onnx_file}" cap "${CAPTURE_STR}")
+  string(REPLACE "@2" "${fname}" cap "${cap}")
+  string(APPEND ALL_CAPTURES "${cap}")
 endforeach()
-string(REPLACE ";" ";\n" EMIT_CAPTURES "${ALL_CAPTURES}")
+
+set(EMIT_CAPTURES "${ALL_CAPTURES}")
+
 configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY)
-configure_file(EmitFromRoot.cxx.in EmitFromRoot_all.cxx @ONLY)
+configure_file(EmitFromONNX_GPU_ALPAKA.cxx.in EmitFromONNX_GPU_ALPAKA_all.cxx @ONLY)
+
+############################################################################
+# Alpaka tests
+############################################################################
+if (ENABLE_ALPAKA_TESTS)
+
+  string(TOLOWER "${ALPAKA_BACKEND}" _alpaka_backend)
+  if (NOT _alpaka_backend MATCHES "^(cuda|cpu|hip|sycl)$")
+    message(FATAL_ERROR "Unsupported ALPAKA_BACKEND=${ALPAKA_BACKEND}")
+  endif()
+
+  FetchContent_Declare(
+    sofieBLAS
+    GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS
+    GIT_TAG dev
+  )
+  FetchContent_MakeAvailable(sofieBLAS)
 
-ROOTTEST_GENERATE_EXECUTABLE(emitFromONNX EmitFromONNX_all.cxx
protobuf::libprotobuf SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-build) - -# silence protobuf warnings seen in version 3.0 and 3.6. Not needed from protobuf version 3.17 -target_compile_options(emitFromONNX PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -ROOTTEST_ADD_TEST(SofieCompileModels_ONNX - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNX ${onnx_file} ${CMAKE_CURRENT_BINARY_DIR}/${fname} - FIXTURES_REQUIRED sofie-compile-models-onnx-build - FIXTURES_SETUP sofie-compile-models-onnx -) - -# Creating a Google Test -if (BLAS_FOUND) # we need BLAS for compiling the models - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx - LIBRARIES - MathCore - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-onnx - FIXTURES_SETUP - sofie-test-models-onnx-build + FIXTURES_SETUP sofie-compile-models-onnx-alpaka-build ) - target_include_directories(TestCustomModelsFromONNX PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromONNX - EXEC ./TestCustomModelsFromONNX - FIXTURES_REQUIRED sofie-test-models-onnx-build) -endif() -# For testing serialisation of RModel object - -ROOTTEST_GENERATE_EXECUTABLE(emitFromROOT EmitFromRoot_all.cxx - LIBRARIES protobuf::libprotobuf RIO SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-root -) -# silence protobuf warnings seen in version 3.0 and 3.6. 
Not needed from protobuf version 3.17 -target_compile_options(emitFromROOT PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -# Automatic compilation of headers from root files -ROOTTEST_ADD_TEST(SofieCompileModels_ROOT - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromROOT - FIXTURES_REQUIRED sofie-compile-models-onnx-root - FIXTURES_SETUP sofie-compile-models-root -) - -if (BLAS_FOUND) - # Creating a Google Test for Serialisation of RModel - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromROOT TestCustomModelsFromROOT.cxx - LIBRARIES - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-root - FIXTURES_SETUP - sofie-test-models-root-build + target_compile_options(emitFromONNXAlpaka PRIVATE + -Wno-unused-parameter + -Wno-array-bounds ) - target_include_directories(TestCustomModelsFromROOT PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromROOT - EXEC ./TestCustomModelsFromROOT - FIXTURES_REQUIRED sofie-test-models-root-build) -endif() -# Look for needed Python modules -ROOT_FIND_PYTHON_MODULE(torch) -if (ROOT_TORCH_FOUND) - configure_file(Conv1dModelGenerator.py Conv1dModelGenerator.py COPYONLY) - configure_file(Conv2dModelGenerator.py Conv2dModelGenerator.py COPYONLY) - configure_file(Conv3dModelGenerator.py Conv3dModelGenerator.py COPYONLY) - configure_file(ConvTrans2dModelGenerator.py ConvTrans2dModelGenerator.py COPYONLY) - configure_file(LinearModelGenerator.py LinearModelGenerator.py COPYONLY) - configure_file(RecurrentModelGenerator.py RecurrentModelGenerator.py COPYONLY) - - if (BLAS_FOUND) - ROOT_ADD_GTEST(TestSofieModels TestSofieModels.cxx - LIBRARIES - SOFIE_core - SOFIE_parsers - BLAS::BLAS - INCLUDE_DIRS - ${CMAKE_CURRENT_BINARY_DIR} + ROOTTEST_ADD_TEST( + SofieCompileModels_ONNX_Alpaka + COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNXAlpaka + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka-build + FIXTURES_SETUP 
sofie-compile-models-onnx-alpaka + ) + + ########################################################################## + # CUDA backend + ########################################################################## + if (_alpaka_backend STREQUAL "cuda") + + message(STATUS "Enabling Alpaka CUDA tests") + + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + + set_source_files_properties( + TestCustomModelsFromONNXForAlpakaCuda.cxx + PROPERTIES LANGUAGE CUDA + ) + + ROOTTEST_GENERATE_EXECUTABLE( + TestCustomModelsFromONNXForAlpakaCuda + TestCustomModelsFromONNXForAlpakaCuda.cxx + LIBRARIES MathCore SOFIE_core GTest::gtest GTest::gtest_main + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + FIXTURES_SETUP sofie-test-models-onnx-alpaka-build ) - endif() -endif() -ROOT_EXECUTABLE(emitGNN GNN/EmitGNN.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGNN COMMAND emitGNN) + target_include_directories( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ${CMAKE_CURRENT_BINARY_DIR} + ${alpaka_SOURCE_DIR}/include + ${sofieblas_SOURCE_DIR}/include + ${ROOT_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR} + ) + + set_target_properties( + TestCustomModelsFromONNXForAlpakaCuda + PROPERTIES + CUDA_SEPARABLE_COMPILATION OFF + CUDA_ARCHITECTURES "70;80;86" + ) + + target_compile_definitions( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ALPAKA_ACC_GPU_CUDA_ENABLED + ALPAKA_HAS_STD_ATOMIC_REF + ) + + target_compile_options( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + $<$<COMPILE_LANGUAGE:CUDA>: + --extended-lambda + --expt-relaxed-constexpr + --generate-line-info + --use_fast_math + -g + -G + # -fsanitize=address + -O1 + -Wno-deprecated-gpu-targets + > + $<$<COMPILE_LANGUAGE:CXX>: + -O2 + -g + -G + -fPIC + -pthread + > + ) + # set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address") + + # ROOT-compatible: plain signature only + target_link_libraries( + TestCustomModelsFromONNXForAlpakaCuda + CUDA::cudart + CUDA::cublas + CUDA::cublasLt + ${ROOT_LIBRARIES} + ) + + 
ROOTTEST_ADD_TEST( + TestCustomModelsFromONNXForAlpakaCuda + EXEC ./TestCustomModelsFromONNXForAlpakaCuda + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + ) -ROOT_EXECUTABLE(EmitGraphIndependent GNN/EmitGraphIndependent.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGraphIndependent COMMAND EmitGraphIndependent) + endif() # cuda backend +endif() # ENABLE_ALPAKA_TESTS diff --git a/src/SOFIE_core/test/EmitFromONNX.cxx.in b/src/SOFIE_core/test/EmitFromONNX.cxx.in index f7a56e2..c464f4d 100644 --- a/src/SOFIE_core/test/EmitFromONNX.cxx.in +++ b/src/SOFIE_core/test/EmitFromONNX.cxx.in @@ -23,7 +23,13 @@ int EmitModel(std::string filename, std::string outname) { int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; } diff --git a/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in b/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in new file mode 100644 index 0000000..58198c1 --- /dev/null +++ b/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in @@ -0,0 +1,27 @@ +// Author: Sanjiban Sengupta + +#include "SOFIE/RModel_Base.hxx" +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelParser_ONNX.hxx" + +using namespace SOFIE; + +int EmitModel(std::string filename, std::string outname) { + + RModelParser_ONNX parser; + RModel model = parser.Parse(filename); + model.GenerateGPU_ALPAKA(); + model.OutputGenerated(outname+"_FromONNX_GPU_ALPAKA.hxx"); + + return 0; +} + +int main(int argc, char *argv[]) { + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX with ALPAKA] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 
0 : 1; +} diff --git a/src/SOFIE_core/test/EmitFromRoot.cxx.in b/src/SOFIE_core/test/EmitFromRoot.cxx.in index 4a630c7..88c0789 100644 --- a/src/SOFIE_core/test/EmitFromRoot.cxx.in +++ b/src/SOFIE_core/test/EmitFromRoot.cxx.in @@ -43,6 +43,10 @@ int EmitModel(std::string inputfile, std::string outname){ int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + int failures = 0; + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ROOT] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; } diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx b/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx index d02dc5e..14eb6a3 100644 --- a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx +++ b/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx @@ -812,7 +812,7 @@ TEST(ONNX, LinearWithLeakyRelu) { constexpr float TOLERANCE = 1; - // Preparing the standard all-ones input + // Preparing input std::vector input({ 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, @@ -2515,7 +2515,7 @@ TEST(ONNX, Equal){ }); SOFIE_Equal::Session s("Equal_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool)); @@ -2540,7 +2540,7 @@ TEST(ONNX, LessOrEqual){ }); SOFIE_LessOrEqual::Session s("LessOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2565,7 +2565,7 @@ TEST(ONNX, GreaterOrEqual){ }); SOFIE_GreaterOrEqual::Session
s("GreaterOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2590,7 +2590,7 @@ TEST(ONNX, Greater){ }); SOFIE_Greater::Session s("Greater_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool)); @@ -2615,7 +2615,7 @@ TEST(ONNX, Less){ }); SOFIE_Less::Session s("Less_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Less_ExpectedOutput::outputs) / sizeof(bool)); diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx new file mode 100644 index 0000000..1303251 --- /dev/null +++ b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -0,0 +1,1096 @@ +#include +#include + +#include "Linear_64_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Linear_64.ref.hxx" + +#include "AddBroadcast1_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/AddBroadcast1.ref.hxx" + +#include "LinearWithLeakyRelu_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/LinearWithLeakyRelu.ref.hxx" + +#include "LinearWithSigmoid_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/LinearWithSigmoid.ref.hxx" + +#include "Transpose_FromONNX_GPU_ALPAKA.hxx" + +#include "Concat_0D_FromONNX_GPU_ALPAKA.hxx" +#include "ScatterElements_FromONNX_GPU_ALPAKA.hxx" + +#include "Split_0_FromONNX_GPU_ALPAKA.hxx" +#include "Split_1_FromONNX_GPU_ALPAKA.hxx" +#include "Split_2_FromONNX_GPU_ALPAKA.hxx" + +#include "Tile5D_FromONNX_GPU_ALPAKA.hxx" +#include 
"input_models/references/Tile5D.ref.hxx" + +#include "GatherAxis0_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis1_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis2_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis3_FromONNX_GPU_ALPAKA.hxx" +#include "Gather2d_FromONNX_GPU_ALPAKA.hxx" +#include "GatherNegativeIndices_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/GatherAxis0.ref.hxx" +#include "input_models/references/GatherAxis1.ref.hxx" +#include "input_models/references/GatherAxis2.ref.hxx" +#include "input_models/references/GatherAxis3.ref.hxx" +#include "input_models/references/Gather2d.ref.hxx" +#include "input_models/references/GatherNegativeIndices.ref.hxx" + +#include "ExpandSameSize_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ExpandSameSize.ref.hxx" + +#include "ExpandDiffSize_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ExpandDiffSize.ref.hxx" + +#include "GatherND_Ex1_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex2_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex3_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex4_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex5_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Batch_FromONNX_GPU_ALPAKA.hxx" + +#include +#include +#include +#include "gtest/gtest.h" + +constexpr float DEFAULT_TOLERANCE = 1e-3f; + +using Idx = std::size_t; +using Dim = alpaka::DimInt<1>; +using Ext1D = alpaka::Vec; + +class SofieAlpakaTest : public ::testing::Test { +protected: + // Shared devices and platforms + alpaka::PlatformCpu hostPlatform; + alpaka::DevCpu host; + alpaka::PlatformCudaRt platform; + alpaka::DevCudaRt device; + alpaka::Queue queue; + + SofieAlpakaTest() + : hostPlatform{} + , host(alpaka::getDevByIdx(hostPlatform, 0u)) + , platform{} + , device(alpaka::getDevByIdx(platform, 0u)) + , queue(device) + { + } + + void SetUp() override { + cudaDeviceSynchronize(); + } + + void TearDown() override { + alpaka::wait(queue); + 
cudaDeviceSynchronize(); + } + + ~SofieAlpakaTest() override { + cudaDeviceSynchronize(); + } +}; + + +// TEST_F(SofieAlpakaTest, Linear64) +// { +// constexpr float TOLERANCE = DEFAULT_TOLERANCE; + +// auto A = alpaka::allocBuf(host, Ext1D::all(Idx{1600})); +// float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + +// for (Idx i = 0; i < 1600; ++i) { +// A_ptr[i] = 1.0; +// } + +// auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{1600})); +// alpaka::memcpy(queue, A_d, A); +// alpaka::wait(queue); + +// auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{160})); + +// { +// SOFIE_Linear_64::Session session("Linear_64_FromONNX_GPU_ALPAKA.dat"); +// auto result = session.infer(A_d); +// alpaka::wait(queue); +// cudaDeviceSynchronize(); + +// alpaka::memcpy(queue, result_h, result); +// alpaka::wait(queue); +// } + +// float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); +// float *correct = Linear_64_ExpectedOutput::all_ones; + +// for (size_t i = 0; i < 160; ++i) { +// EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +// } +// } + +TEST_F(SofieAlpakaTest, LinearWithLeakyRelu) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, + -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, + 0.7057, -0.3749, -0.3310, 0.0986, -0.1370, 0.0832, -1.6465, -0.2793 + }); + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < input.size(); ++i) { + A_ptr[i] = input[i]; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{24})); + + { + SOFIE_LinearWithLeakyRelu::Session session; + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + 
alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = LinearWithLeakyRelu_ExpectedOutput::outputs; + + for (size_t i = 0; i < 24; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, LinearWithSigmoid) +{ + + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{48})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < 48; ++i) { + A_ptr[i] = 1.0; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{48})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{24})); + + { + SOFIE_LinearWithSigmoid::Session session("LinearWithSigmoid_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = LinearWithSigmoid_ExpectedOutput::all_ones; + for (size_t i = 0; i < 24; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, AddBroadcast1) +{ + + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{5})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + auto B = alpaka::allocBuf(host, Ext1D::all(Idx{20})); + float *B_ptr = reinterpret_cast(alpaka::getPtrNative(B)); + + std::vector A_vec({-0.78023305, -1.34029483, -3.01482951, 0.53641361, + -1.22594789}); + std::vector B_vec({1.0626695, 0.43842875, 1.22476468, 0.79763274, 0.98688211, + 0.25267614, 0.44874883, 0.31516773, -0.78771195, 0.64565664, + 0.50450593, -0.41265227, -0.22474539, -0.22362374, 0.00509674, + 0.16927211, 1.06756969, -0.81634773, 0.88467744, 0.78902059}); + + for (Idx i = 0; i < A_vec.size(); ++i) { + A_ptr[i] = A_vec[i]; + } + + for (Idx i = 0; i 
< B_vec.size(); ++i) { + B_ptr[i] = B_vec[i]; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{5})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto B_d = alpaka::allocBuf(device, Ext1D::all(Idx{20})); + alpaka::memcpy(queue, B_d, B); + alpaka::wait(queue); + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{20})); + + { + SOFIE_AddBroadcast1::Session session; + auto result = session.infer(A_d, B_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = AddBroadcast1_ExpectedOutput::output; + for (size_t i = 0; i < 20; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Transpose) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Input shape: (2, 1, 3, 4) -> 24 elements + constexpr Idx inputSize = 24; + // Output shape: (2, 3, 4, 1) -> 24 elements + constexpr Idx outputSize = 24; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + + std::vector input_vec({ + // shape (2, 1, 3, 4) + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }); + + for (Idx i = 0; i < inputSize; ++i) + input_ptr[i] = input_vec[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Transpose::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + std::vector expected(outputSize); + std::vector inputShape = {2, 1, 3, 4}; + std::vector perm = {0, 2, 3, 1}; + std::vector 
outputShape = {2, 3, 4, 1}; + + std::vector inputStrides = {12, 12, 4, 1}; + std::vector outputStrides = {12, 4, 1, 1}; + + for (size_t i = 0; i < outputSize; ++i) + { + size_t remaining = i; + size_t inputIdx = 0; + for (size_t d = 0; d < 4; ++d) + { + size_t const coord = remaining / outputStrides[d]; + remaining = remaining - coord * outputStrides[d]; + inputIdx += coord * inputStrides[perm[d]]; + } + expected[i] = input_vec[inputIdx]; + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Concat0D) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.40519865e+00, -2.87660856e-01}); + std::vector expected_output({ + 1.40519865e+00, -2.87660856e-01, + 1.40519865e+00, -2.87660856e-01 + }); + + // Host input buffer + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + // Device input buffer + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // Host output buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected_output.size()})); + + { + SOFIE_Concat_0D::Session session("Concat_0D_FromONNX_GPU_ALPAKA.dat"); + + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + for (size_t i = 0; i < expected_output.size(); ++i) { + EXPECT_LE(std::abs(res_ptr[i] - expected_output[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, ScatterElements) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input (9, 0.f); + std::vector indices = { 1, 0, 2, 
0, 2, 1 }; + std::vector updates = { 1.f, 1.1f, 1.2f, 2.f, 2.1f, 2.2f }; + std::vector correct = { 2.f, 1.1f, 0.f, 1.f, 0.f, 2.2f, 0.f, 2.1f, 1.2f }; + + // Allocate and fill host buffers + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + auto indices_h = alpaka::allocBuf(host, Ext1D::all(Idx{indices.size()})); + auto updates_h = alpaka::allocBuf(host, Ext1D::all(Idx{updates.size()})); + + float* input_ptr = reinterpret_cast (alpaka::getPtrNative(input_h)); + int64_t* indices_ptr = reinterpret_cast(alpaka::getPtrNative(indices_h)); + float* updates_ptr = reinterpret_cast (alpaka::getPtrNative(updates_h)); + + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + for (Idx i = 0; i < indices.size(); ++i) indices_ptr[i] = indices[i]; + for (Idx i = 0; i < updates.size(); ++i) updates_ptr[i] = updates[i]; + + // Allocate device buffers and copy + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + auto indices_d = alpaka::allocBuf(device, Ext1D::all(Idx{indices.size()})); + auto updates_d = alpaka::allocBuf(device, Ext1D::all(Idx{updates.size()})); + + alpaka::memcpy(queue, input_d, input_h); + alpaka::memcpy(queue, indices_d, indices_h); + alpaka::memcpy(queue, updates_d, updates_h); + alpaka::wait(queue); + + // Host result buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + SOFIE_ScatterElements::Session session; + auto result = session.infer(input_d, indices_d, updates_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(correct.size(), 9u); + for (size_t i = 0; i < correct.size(); ++i){ + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Split_0) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 0 in 2 tensors {2,2,3} -> {1,2,3} each + std::vector input 
{1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,3.,4.,5.,6.}, {7.,8.,9.,10.,11.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_0::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Split_1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 1 in 2 tensors {2,2,3} -> {2,1,3} each + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,3.,7.,8.,9.}, {4.,5.,6.,10.,11.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result0_h = alpaka::allocBuf(host, 
Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_1::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Split_2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 2 in 2 tensors {2,2,3} -> {2,2,2} and {2,2,1} + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,4.,5.,7.,8.,10.,11.}, {3.,6.,9.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // outputs have different sizes: {2,2,2}=8 and {2,2,1}=4 + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_2::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = 
reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Tile5D) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input_data({ + 0.2386120855808258, 0.5549510717391968, -1.8190287351608276, 0.5724563598632812, -0.6596977710723877, + 0.17560836672782898, 0.7608169317245483, 0.08603227883577347, -0.049375515431165695, 0.2705111503601074, + 1.42119562625885, 0.032626643776893616, -1.212586522102356, -0.5129594802856445, -0.43296414613723755, + -0.1606937050819397, 1.1884371042251587, -0.662174642086029, -2.291109323501587, -0.6852569580078125, + 2.325223922729492, -0.19389064610004425, -0.5784135460853577, -0.39328137040138245, 0.2831517457962036, + 0.4496127665042877, -0.2029038816690445, 0.35477763414382935, 0.4266718924045563, 0.24683749675750732, + 1.90426504611969, -0.4861580729484558, 0.9139055013656616, -0.5031066536903381, 0.9583520293235779, + -0.23210509121418, 1.3183971643447876, 1.7042455673217773, -0.3201166093349457, -0.14444805681705475, + -0.8829464912414551, 1.725736141204834, 0.45657631754875183, 0.4920198321342468, -1.088847041130066, + 0.49437597393989563, -0.006085286382585764, 2.475630760192871, 0.12170185893774033, -0.8953945636749268, + 1.1430096626281738, 1.3278610706329346, 0.3076854348182678, 0.036237504333257675, 0.05180325731635094, + 0.2802475392818451, 0.5289335250854492, 0.9356630444526672, 0.7863689064979553, 0.4239695370197296, + 0.8723016977310181, -0.2248474359512329, 0.3891502320766449, 0.5463842153549194, -0.7782878875732422, + -0.8570080399513245, -2.593783378601074, -0.11392943561077118, 0.5637082457542419, 2.075004816055298, + -1.0598397254943848, 1.0823975801467896 
+ }); + + const std::size_t inputSize = input_data.size(); + const std::size_t outputSize = sizeof(Tile5D_ExpectedOutput::output) / sizeof(float); + + // Allocate and fill host input buffer + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < inputSize; ++i) + input_ptr[i] = input_data[i]; + + // Copy to device + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // Host result buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Tile5D::Session session; + auto result = session.infer(input_d); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Tile5D_ExpectedOutput::output; + + EXPECT_EQ(outputSize, sizeof(Tile5D_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis0) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis0::Session session("GatherAxis0_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + 
alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis0_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis1::Session session("GatherAxis1_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis1_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, 
Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis2::Session session("GatherAxis2_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis2_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis3) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis3_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis3::Session session("GatherAxis3_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis3_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis3_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Gather2d) +{ + constexpr float TOLERANCE = 
DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 9; + const std::size_t outputSize = sizeof(Gather2d_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Gather2d::Session session("Gather2d_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Gather2d_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Gather2d_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherNegativeIndices) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 10; + const std::size_t outputSize = sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherNegativeIndices::Session session("GatherNegativeIndices_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); 
+ } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherNegativeIndices_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, ExpandSameSize) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.f, 1.f, 2.f}); + const std::size_t outputSize = sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ExpandSameSize::Session session("ExpandSameSize_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ExpandSameSize_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, ExpandDiffSize) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.f, 1.f, 2.f}); + const std::size_t outputSize = sizeof(ExpandDiffSize_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] 
= input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ExpandDiffSize::Session session("ExpandDiffSize_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ExpandDiffSize_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ExpandDiffSize_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherND_Ex1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f, 1.f, 2.f, 3.f}; + std::vector expected = {0.f, 3.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex1::Session session("GatherND_Ex1_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 2u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f, 1.f, 2.f, 
3.f}; + std::vector expected = {2.f, 3.f, 0.f, 1.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex2::Session session("GatherND_Ex2_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex3) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex3::Session session("GatherND_Ex3_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - 
expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex4) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex4::Session session("GatherND_Ex4_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex5) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex5::Session session("GatherND_Ex5_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } 
+ + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_NegativeIndices) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f}; + std::vector expected = {6.f, 2.f, 4.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_NegativeIndices::Session session("GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 3u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Batch) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data(24); + std::iota(data.begin(), data.end(), 0.f); + std::vector expected = {4.f,5.f,6.f,7.f, 20.f,21.f,22.f,23.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); 
+ + { + SOFIE_GatherND_Batch::Session session("GatherND_Batch_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 8u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} diff --git a/src/SOFIE_core/test/input_models/GNN_model.onnx b/src/SOFIE_core/test/input_models/GNN_model.onnx new file mode 100644 index 0000000..833e34d Binary files /dev/null and b/src/SOFIE_core/test/input_models/GNN_model.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Batch.onnx b/src/SOFIE_core/test/input_models/GatherND_Batch.onnx new file mode 100644 index 0000000..4d146c6 Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Batch.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Ex1.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex1.onnx new file mode 100644 index 0000000..bc1a910 Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex1.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Ex2.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex2.onnx new file mode 100644 index 0000000..4cd511c Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex2.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Ex3.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex3.onnx new file mode 100644 index 0000000..917008f Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex3.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Ex4.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex4.onnx new file mode 100644 index 0000000..d3006a2 Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex4.onnx differ diff --git 
a/src/SOFIE_core/test/input_models/GatherND_Ex5.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex5.onnx new file mode 100644 index 0000000..be1ba0d Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex5.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_NegativeIndices.onnx b/src/SOFIE_core/test/input_models/GatherND_NegativeIndices.onnx new file mode 100644 index 0000000..5fa05aa Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_NegativeIndices.onnx differ diff --git a/src/SOFIE_core/test/input_models/Transpose.onnx b/src/SOFIE_core/test/input_models/Transpose.onnx new file mode 100644 index 0000000..0e08157 Binary files /dev/null and b/src/SOFIE_core/test/input_models/Transpose.onnx differ diff --git a/src/SOFIE_parsers/CMakeLists.txt b/src/SOFIE_parsers/CMakeLists.txt index 379b7d7..0e7e03d 100644 --- a/src/SOFIE_parsers/CMakeLists.txt +++ b/src/SOFIE_parsers/CMakeLists.txt @@ -61,6 +61,7 @@ set(sources_cxx src/ParseLayerNormalization.cxx src/ParseExpand.cxx src/ParseGather.cxx + src/ParseGatherND.cxx src/ParseElu.cxx src/ParseFuseConvAdd.cxx src/ParseFuseConvTransposeAdd.cxx @@ -102,6 +103,15 @@ target_include_directories(SOFIE_parsers PUBLIC set_target_properties(SOFIE_parsers PROPERTIES POSITION_INDEPENDENT_CODE TRUE) + ROOT_GENERATE_DICTIONARY(G__SOFIE_parsers ${sources_headers} + LINKDEF inc/LinkDef.h + MODULE SOFIE_parsers + OPTIONS --deep +) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers.rootmap + DESTINATION lib) + install(TARGETS SOFIE_parsers LIBRARY DESTINATION lib ) diff --git a/src/SOFIE_parsers/src/ParseGatherND.cxx b/src/SOFIE_parsers/src/ParseGatherND.cxx new file mode 100644 index 0000000..57beb01 --- /dev/null +++ b/src/SOFIE_parsers/src/ParseGatherND.cxx @@ -0,0 +1,49 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_GatherND.hxx" +#include "onnx_proto3.pb.h" +#include + + +namespace SOFIE { + 
/* ParseGatherND: parser hook that registers an ONNX GatherND node with SOFIE. It requires the data tensor's type to be registered (throws otherwise), checks — when the indices tensor's type is already registered — that indices are INT64, reads the optional 'batch_dims' attribute (default 0), and propagates the data tensor's element type to the output tensor. NOTE(review): angle-bracket template arguments were stripped from this paste (e.g. std::unique_ptr<ROperator>, the ROperator_GatherND type parameter) — restore against the original file. */ +ParserFuncSignature ParseGatherND = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + ETensorType input_type = ETensorType::UNDEFINED; + auto input_name = nodeproto.input(0); + if (parser.IsRegisteredTensorType(input_name)) { + input_type = parser.GetTensorType(input_name); + } else { + throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op has input tensor " + input_name + + " but its type is not yet registered"); + } + + /* NOTE(review): if the indices tensor's type is not yet registered, the INT64 validation below is silently skipped — confirm this is intended rather than an error path. */ auto indices_name = nodeproto.input(1); + if (parser.IsRegisteredTensorType(indices_name)) { + ETensorType indices_type = parser.GetTensorType(indices_name); + if (indices_type != ETensorType::INT64) { + throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op indices tensor must be INT64, got " + + indices_name); + } + } + + int64_t batch_dims = 0; + for (int i = 0; i < nodeproto.attribute_size(); ++i) { + const auto& attr = nodeproto.attribute(i); + if (attr.name() == "batch_dims") { + batch_dims = attr.i(); + break; + } + } + + std::string output_name = nodeproto.output(0); + + /* NOTE(review): prefer std::make_unique over raw new here, matching modern usage elsewhere in the parsers. */ std::unique_ptr op( + new ROperator_GatherND(batch_dims, input_name, indices_name, output_name)); + + if (!parser.IsRegisteredTensorType(output_name)) { + parser.RegisterTensorType(output_name, input_type); + } + + return op; +}; + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseTile.cxx b/src/SOFIE_parsers/src/ParseTile.cxx index 20dbfb6..8b8c47f 100644 --- a/src/SOFIE_parsers/src/ParseTile.cxx +++ b/src/SOFIE_parsers/src/ParseTile.cxx @@ -29,6 +29,7 @@ ParserFuncSignature ParseTile = [](RModelParser_ONNX &parser, const onnx::NodePr switch (input_type) { case ETensorType::FLOAT: op.reset(new ROperator_Tile(repeat_name, input_name, output_name)); break; + /* NOTE(review): new INT64 case for Tile — the ROperator_Tile template argument (presumably <int64_t>) was stripped in this paste; verify against the original file. */ case ETensorType::INT64: op.reset(new ROperator_Tile(repeat_name, input_name, output_name)); break; + default: throw std::runtime_error("TMVA::SOFIE - Unsupported - Operator Tile does not yet support input type " + std::to_string(static_cast(input_type))); diff --git 
a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx b/src/SOFIE_parsers/src/RModelParser_ONNX.cxx index 68662ae..5924836 100644 --- a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx +++ b/src/SOFIE_parsers/src/RModelParser_ONNX.cxx @@ -73,6 +73,7 @@ extern ParserFuncSignature ParseShape; extern ParserFuncSignature ParseMatMul; extern ParserFuncSignature ParseLayerNormalization; extern ParserFuncSignature ParseGather; +extern ParserFuncSignature ParseGatherND; extern ParserFuncSignature ParseErf; extern ParserFuncSignature ParseElu; extern ParserFuncSignature ParseEyeLike; @@ -134,6 +135,7 @@ struct ExtractDataFromTP { }; template std::shared_ptr GetInitializedTensorData(onnx::TensorProto * tensorproto, size_t length) { + /* FIXME(review): leftover debug print committed to the parser — it fires unconditionally for every initialized tensor. Remove it, or gate it behind the parser's existing 'verbose' flag like the other diagnostics in this file. */ std::cout<<"Getting Initialized Tensor data for tensor " << tensorproto->name() << " of type " << tensorproto->data_type() << " and length " << length << std::endl; std::shared_ptr data(malloc(length * sizeof(T)), free); if (!tensorproto->raw_data().empty()) { @@ -217,6 +219,7 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un RegisterOperator("LayerNormalization", ParseLayerNormalization); RegisterOperator("Expand", ParseExpand); RegisterOperator("Gather", ParseGather); + RegisterOperator("GatherND", ParseGatherND); RegisterOperator("Erf", ParseErf); RegisterOperator("Elu", ParseElu); RegisterOperator("EyeLike", ParseEyeLike); @@ -584,6 +587,13 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & if (verbose) std::cout << "add INT64 initialized tensor " << input_name << " shape " << ConvertShapeToString(shape) << std::endl; rmodel.AddInitializedTensor(input_name, ETensorType::INT64, shape, data); allInitializedTensors[input_name] = i; + /* FIXME(review): leftover debug dump — remove or gate behind 'verbose'. Additionally, the loop below reads 'fLength', which does not appear to be a local of this scope (presumably it should be the tensor's element count) — confirm against the original file; the reinterpret_cast/static_cast template argument here was also stripped in this paste. */ std::cout<<"Printing initialized values for tensor: "<(data.get()); + + for (size_t i = 0; i < fLength; ++i) { + std::cout << rawData[i] << " "; + } + std::cout << std::endl; break; } default: diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt new 
file mode 100644 index 0000000..2ede060 --- /dev/null +++ b/src/utils/CMakeLists.txt @@ -0,0 +1,11 @@ +add_library(utils INTERFACE) + +target_include_directories(utils INTERFACE + $ + $ +) + +install( + DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/SOFIE + DESTINATION include +) diff --git a/src/utils/SOFIE/RTensor.hxx b/src/utils/SOFIE/RTensor.hxx new file mode 100644 index 0000000..db82dc9 --- /dev/null +++ b/src/utils/SOFIE/RTensor.hxx @@ -0,0 +1,628 @@ +#ifndef SOFIE_RTENSOR +#define SOFIE_RTENSOR + +#include +#include // std::size_t +#include +#include // std::runtime_error +#include // std::stringstream +#include // std::shared_ptr +#include // std::is_convertible +#include // std::reverse +#include // std::random_access_iterator_tag + +namespace SOFIE { + +/// Memory layout type +enum class MemoryLayout : uint8_t { + RowMajor = 0x01, + ColumnMajor = 0x02 +}; + +namespace Internal { + +/// \brief Get size of tensor from shape vector +/// \param[in] shape Shape vector +/// \return Size of contiguous memory +template +inline std::size_t GetSizeFromShape(const T &shape) +{ + if (shape.size() == 0) + return 0; + std::size_t size = 1; + for (auto &s : shape) + size *= s; + return size; +} + +/// \brief Compute strides from shape vector. +/// \param[in] shape Shape vector +/// \param[in] layout Memory layout +/// \return Size of contiguous memory +/// +/// This information is needed for the multi-dimensional indexing. 
See here: +/// https://en.wikipedia.org/wiki/Row-_and_column-major_order +/// https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.strides.html +template +inline std::vector ComputeStridesFromShape(const T &shape, MemoryLayout layout) +{ + const auto size = shape.size(); + T strides(size); + if (layout == MemoryLayout::RowMajor) { + for (std::size_t i = 0; i < size; i++) { + if (i == 0) { + strides[size - 1 - i] = 1; + } else { + strides[size - 1 - i] = strides[size - 1 - i + 1] * shape[size - 1 - i + 1]; + } + } + } else if (layout == MemoryLayout::ColumnMajor) { + for (std::size_t i = 0; i < size; i++) { + if (i == 0) { + strides[i] = 1; + } else { + strides[i] = strides[i - 1] * shape[i - 1]; + } + } + } else { + std::stringstream ss; + ss << "Memory layout type is not valid for calculating strides."; + throw std::runtime_error(ss.str()); + } + return strides; +} + +/// \brief Compute indices from global index +/// \param[in] shape Shape vector +/// \param[in] idx Global index +/// \param[in] layout Memory layout +/// \return Indice vector +template +inline T ComputeIndicesFromGlobalIndex(const T& shape, MemoryLayout layout, const typename T::value_type idx) +{ + const auto size = shape.size(); + auto strides = ComputeStridesFromShape(shape, layout); + T indices(size); + auto r = idx; + for (std::size_t i = 0; i < size; i++) { + indices[i] = int(r / strides[i]); + r = r % strides[i]; + } + return indices; +} + +/// \brief Compute global index from indices +/// \param[in] strides Strides vector +/// \param[in] idx Indice vector +/// \return Global index +template +inline std::size_t ComputeGlobalIndex(const U& strides, const V& idx) +{ + std::size_t globalIndex = 0; + const auto size = idx.size(); + for (std::size_t i = 0; i < size; i++) { + globalIndex += strides[size - 1 - i] * idx[size - 1 - i]; + } + return globalIndex; +} + +/// \brief Type checking for all types of a parameter pack, e.g., used in combination with std::is_convertible +template 
+struct and_types : std::true_type { +}; + +template +struct and_types : std::integral_constant()> { +}; + +/// \brief Copy slice of a tensor recursively from here to there +/// \param[in] here Source tensor +/// \param[in] there Target tensor (slice of source tensor) +/// \param[in] mins Minimum of indices for each dimension +/// \param[in] maxs Maximum of indices for each dimension +/// \param[in] idx Current indices +/// \param[in] active Active index needed to stop the recursion +/// +/// Copy the content of a slice of a tensor from source to target. This is done +/// by recursively iterating over the ranges of the slice for each dimension. +template +void RecursiveCopy(const T &here, T &there, + const std::vector &mins, const std::vector &maxs, + std::vector idx, std::size_t active) +{ + const auto size = idx.size(); + for (std::size_t i = mins[active]; i < maxs[active]; i++) { + idx[active] = i; + if (active == size - 1) { + auto idxThere = idx; + for (std::size_t j = 0; j < size; j++) { + idxThere[j] -= mins[j]; + } + there(idxThere) = here(idx); + } else { + Internal::RecursiveCopy(here, there, mins, maxs, idx, active + 1); + } + } +} + +} // namespace SOFIE::Internal + +/// \class SOFIE::RTensor +/// \brief RTensor is a container with contiguous memory and shape information. +/// \tparam T Data-type of the tensor +/// +/// An RTensor is a vector-like container, which has additional shape information. +/// The elements of the multi-dimensional container can be accessed by their +/// indices in a coherent way without taking care about the one-dimensional memory +/// layout of the contiguous storage. This also allows to manipulate the shape +/// of the container without moving the actual elements in memory. Another feature +/// is that an RTensor can own the underlying contiguous memory but can also represent +/// only a view on existing data without owning it. 
+template > +class RTensor { +public: + // Typedefs + using Value_t = V; + using Shape_t = std::vector; + using Index_t = Shape_t; + using Slice_t = std::vector; + using Container_t = C; + +private: + Shape_t fShape; + Shape_t fStrides; + std::size_t fSize; + MemoryLayout fLayout; + Value_t *fData; + std::shared_ptr fContainer; + +protected: + void ReshapeInplace(const Shape_t &shape); + +public: + // Constructors + + /// \brief Construct a tensor as view on data + /// \param[in] data Pointer to data contiguous in memory + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + RTensor(Value_t *data, Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout), fData(data), fContainer(nullptr) + { + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + } + + /// \brief Construct a tensor as view on data + /// \param[in] data Pointer to data contiguous in memory + /// \param[in] shape Shape vector + /// \param[in] strides Strides vector + /// \param[in] layout Memory layout + RTensor(Value_t *data, Shape_t shape, Shape_t strides, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fStrides(strides), fLayout(layout), fData(data), fContainer(nullptr) + { + fSize = Internal::GetSizeFromShape(shape); + } + + /// \brief Construct a tensor owning externally provided data + /// \param[in] container Shared pointer to data container + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + RTensor(std::shared_ptr container, Shape_t shape, + MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout), fContainer(container) + { + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + fData = std::data(*fContainer); + } + + /// \brief Construct a tensor owning data initialized with new container + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + 
RTensor(Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout) + { + // TODO: Document how data pointer is determined using STL iterator interface. + // TODO: Sanitize given container type with type traits + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + fContainer = std::make_shared(fSize); + fData = std::data(*fContainer); + } + + // Access elements + Value_t &operator()(const Index_t &idx); + const Value_t &operator() (const Index_t &idx) const; + template Value_t &operator()(Idx... idx); + template const Value_t &operator() (Idx... idx) const; + + // Access properties + std::size_t GetSize() const { return fSize; } + const Shape_t &GetShape() const { return fShape; } + const Shape_t &GetStrides() const { return fStrides; } + Value_t *GetData() { return fData; } + const Value_t *GetData() const { return fData; } + std::shared_ptr GetContainer() { return fContainer; } + const std::shared_ptr GetContainer() const { return fContainer; } + MemoryLayout GetMemoryLayout() const { return fLayout; } + bool IsView() const { return fContainer == nullptr; } + bool IsOwner() const { return !IsView(); } + + // Copy + RTensor Copy(MemoryLayout layout = MemoryLayout::RowMajor) const; + + // Transformations + RTensor Transpose() const; + RTensor Squeeze() const; + RTensor ExpandDims(int idx) const; + RTensor Reshape(const Shape_t &shape) const; + RTensor Resize(const Shape_t &shape); + RTensor Slice(const Slice_t &slice); + + // Iterator class + class Iterator { + private: + RTensor& fTensor; + Index_t::value_type fGlobalIndex; + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = Value_t; + using difference_type = std::ptrdiff_t; + using pointer = Value_t *; + using reference = Value_t &; + + Iterator(RTensor& x, typename Index_t::value_type idx) : fTensor(x), fGlobalIndex(idx) {} + Iterator& operator++() { fGlobalIndex++; return *this; } 
+ Iterator operator++(int) { auto tmp = *this; operator++(); return tmp; } + Iterator& operator--() { fGlobalIndex--; return *this; } + Iterator operator--(int) { auto tmp = *this; operator--(); return tmp; } + Iterator operator+(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex + rhs); } + Iterator operator-(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex - rhs); } + difference_type operator-(const Iterator& rhs) { return fGlobalIndex - rhs.GetGlobalIndex(); } + Iterator& operator+=(difference_type rhs) { fGlobalIndex += rhs; return *this; } + Iterator& operator-=(difference_type rhs) { fGlobalIndex -= rhs; return *this; } + Value_t& operator*() + { + auto idx = Internal::ComputeIndicesFromGlobalIndex(fTensor.GetShape(), fTensor.GetMemoryLayout(), fGlobalIndex); + return fTensor(idx); + } + bool operator==(const Iterator& rhs) const + { + if (fGlobalIndex == rhs.GetGlobalIndex()) return true; + return false; + } + bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }; + bool operator>(const Iterator& rhs) const { return fGlobalIndex > rhs.GetGlobalIndex(); } + bool operator<(const Iterator& rhs) const { return fGlobalIndex < rhs.GetGlobalIndex(); } + bool operator>=(const Iterator& rhs) const { return fGlobalIndex >= rhs.GetGlobalIndex(); } + bool operator<=(const Iterator& rhs) const { return fGlobalIndex <= rhs.GetGlobalIndex(); } + typename Index_t::value_type GetGlobalIndex() const { return fGlobalIndex; }; + }; + + // Iterator interface + // TODO: Document that the iterator always iterates following the physical memory layout. 
+ Iterator begin() noexcept { + return Iterator(*this, 0); + } + Iterator end() noexcept { + return Iterator(*this, fSize); + } +}; + +/// \brief Reshape tensor in place +/// \param[in] shape Shape vector +/// Reshape tensor without changing the overall size +template +inline void RTensor::ReshapeInplace(const Shape_t &shape) +{ + const auto size = Internal::GetSizeFromShape(shape); + if (size != fSize) { + std::stringstream ss; + ss << "Cannot reshape tensor with size " << fSize << " into shape { "; + for (std::size_t i = 0; i < shape.size(); i++) { + if (i != shape.size() - 1) { + ss << shape[i] << ", "; + } else { + ss << shape[i] << " }."; + } + } + throw std::runtime_error(ss.str()); + } + + // Compute new strides from shape + auto strides = Internal::ComputeStridesFromShape(shape, fLayout); + fShape = shape; + fStrides = strides; +} + + +/// \brief Access elements +/// \param[in] idx Index vector +/// \return Reference to element +template +inline Value_t &RTensor::operator()(const Index_t &idx) +{ + const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx); + return fData[globalIndex]; +} + +/// \brief Access elements +/// \param[in] idx Index vector +/// \return Reference to element +template +inline const Value_t &RTensor::operator() (const Index_t &idx) const +{ + const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx); + return fData[globalIndex]; +} + +/// \brief Access elements +/// \param[in] idx Indices +/// \return Reference to element +template +template +Value_t &RTensor::operator()(Idx... idx) +{ + static_assert(Internal::and_types...>{}, + "Indices are not convertible to std::size_t."); + return operator()({static_cast(idx)...}); +} + +/// \brief Access elements +/// \param[in] idx Indices +/// \return Reference to element +template +template +const Value_t &RTensor::operator() (Idx... 
idx) const +{ + static_assert(Internal::and_types...>{}, + "Indices are not convertible to std::size_t."); + return operator()({static_cast(idx)...}); +} + +/// \brief Transpose +/// \returns New RTensor +/// The tensor is transposed by inverting the associated memory layout from row- +/// major to column-major and vice versa. Therefore, the underlying data is not +/// touched. +template +inline RTensor RTensor::Transpose() const +{ + MemoryLayout layout; + // Transpose by inverting memory layout + if (fLayout == MemoryLayout::RowMajor) { + layout = MemoryLayout::ColumnMajor; + } else if (fLayout == MemoryLayout::ColumnMajor) { + layout = MemoryLayout::RowMajor; + } else { + throw std::runtime_error("Memory layout is not known."); + } + + // Create copy of container + RTensor x(fData, fShape, fStrides, layout); + + // Reverse shape + std::reverse(x.fShape.begin(), x.fShape.end()); + + // Reverse strides + std::reverse(x.fStrides.begin(), x.fStrides.end()); + + return x; +} + +/// \brief Squeeze dimensions +/// \returns New RTensor +/// Squeeze removes the dimensions of size one from the shape. +template +inline RTensor RTensor::Squeeze() const +{ + // Remove dimensions of one and associated strides + Shape_t shape; + Shape_t strides; + for (std::size_t i = 0; i < fShape.size(); i++) { + if (fShape[i] != 1) { + shape.emplace_back(fShape[i]); + strides.emplace_back(fStrides[i]); + } + } + + // If all dimensions are 1, we need to keep one. + // This does not apply if the inital shape is already empty. Then, return + // the empty shape. + if (shape.size() == 0 && fShape.size() != 0) { + shape.emplace_back(1); + strides.emplace_back(1); + } + + // Create copy, attach new shape and strides and return + RTensor x(*this); + x.fShape = shape; + x.fStrides = strides; + return x; +} + +/// \brief Expand dimensions +/// \param[in] idx Index in shape vector where dimension is added +/// \returns New RTensor +/// Inserts a dimension of one into the shape. 
+template +inline RTensor RTensor::ExpandDims(int idx) const +{ + // Compose shape vector with additional dimensions and adjust strides + const int len = fShape.size(); + auto shape = fShape; + auto strides = fStrides; + if (idx < 0) { + idx = len + 1 + idx; + } + if (idx < 0) { + throw std::runtime_error("Given negative index is invalid."); + } + else if (idx > len) { + throw std::runtime_error("Given index is invalid."); + } + shape.insert(shape.begin() + idx, 1); + strides = Internal::ComputeStridesFromShape(shape, fLayout); + + // Create view copy, attach new shape and strides and return + RTensor x(*this); + x.fShape = shape; + x.fStrides = strides; + return x; +} + +/// \brief Reshape tensor +/// \param[in] shape Shape vector +/// \returns New RTensor +/// Reshape tensor without changing the overall size +template +inline RTensor RTensor::Reshape(const Shape_t &shape) const +{ + // Create copy, replace and return + RTensor x(*this); + x.ReshapeInplace(shape); + return x; +} + +/// \brief Resize tensor +/// \param[in] shape Shape vector +/// \returns New RTensor +/// Resize tensor into new shape +template +inline RTensor RTensor::Resize(const Shape_t &shape) +{ + // Create new tensor with the specified shape + RTensor x(shape, fLayout); + + // Copying contents from previous tensor + size_t n = (x.GetSize()>fSize) ? fSize : x.GetSize(); + std::copy(this->GetData(), this->GetData() + n, x.GetData() ); + + return x; +} + +/// \brief Create a slice of the tensor +/// \param[in] slice Slice vector +/// \returns New RTensor +/// A slice is a subset of the tensor defined by a vector of pairs of indices. 
+template +inline RTensor RTensor::Slice(const Slice_t &slice) +{ + // Sanitize size of slice + const auto sliceSize = slice.size(); + const auto shapeSize = fShape.size(); + if (sliceSize != shapeSize) { + std::stringstream ss; + ss << "Size of slice (" << sliceSize << ") is unequal number of dimensions (" << shapeSize << ")."; + throw std::runtime_error(ss.str()); + } + + // Sanitize slice indices + // TODO: Sanitize slice indices + /* + for (std::size_t i = 0; i < sliceSize; i++) { + } + */ + + // Convert -1 in slice to proper pair of indices + // TODO + + // Recompute shape and size + Shape_t shape(sliceSize); + for (std::size_t i = 0; i < sliceSize; i++) { + shape[i] = slice[i][1] - slice[i][0]; + } + auto size = Internal::GetSizeFromShape(shape); + + // Determine first element contributing to the slice and get the data pointer + Value_t *data; + Shape_t idx(sliceSize); + for (std::size_t i = 0; i < sliceSize; i++) { + idx[i] = slice[i][0]; + } + data = &operator()(idx); + + // Create copy and modify properties + RTensor x(*this); + x.fData = data; + x.fShape = shape; + x.fSize = size; + + // Squeeze tensor and return + return x.Squeeze(); +} + +/// Copy RTensor to new object +/// \param[in] layout Memory layout of the new RTensor +/// \returns New RTensor +/// The operation copies all elements of the current RTensor to a new RTensor +/// with the given layout contiguous in memory. Note that this copies by default +/// to a row major memory layout. 
+template +inline RTensor RTensor::Copy(MemoryLayout layout) const +{ + // Create new tensor with zeros owning the memory + RTensor r(fShape, layout); + + // Copy over the elements from this tensor + const auto mins = Shape_t(fShape.size()); + const auto maxs = fShape; + auto idx = mins; + Internal::RecursiveCopy(*this, r, mins, maxs, idx, 0); + + return r; +} + +/// \brief Pretty printing +/// \param[in] os Output stream +/// \param[in] x RTensor +/// \return Modified output stream +template +std::ostream &operator<<(std::ostream &os, RTensor &x) +{ + const auto shapeSize = x.GetShape().size(); + if (shapeSize == 1) { + os << "{ "; + const auto size = x.GetSize(); + for (std::size_t i = 0; i < size; i++) { + os << x({i}); + if (i != size - 1) + os << ", "; + } + os << " }"; + } else if (shapeSize == 2) { + os << "{"; + const auto shape = x.GetShape(); + for (std::size_t i = 0; i < shape[0]; i++) { + os << " { "; + for (std::size_t j = 0; j < shape[1]; j++) { + os << x({i, j}); + if (j < shape[1] - 1) { + os << ", "; + } else { + os << " "; + } + } + os << "}"; + } + os << " }"; + } else { + os << "{ printing not yet implemented for this rank }"; + } + return os; +} + +} // namespace SOFIE + +namespace cling { +template +std::string printValue(SOFIE::RTensor *x) +{ + std::stringstream ss; + ss << *x; + return ss.str(); +} +} // namespace cling + +#endif // SOFIE_RTENSOR