diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..182ccd4 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,81 @@ +{ + "files.associations": { + "*.icc": "cpp", + "limits": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "*.tcc": "cpp", + "bitset": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "iostream": "cpp", + "istream": "cpp", + "new": "cpp", + "numbers": "cpp", + "ostream": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", + "cinttypes": "cpp", + "typeinfo": "cpp", + "charconv": "cpp", + "chrono": "cpp", + "condition_variable": "cpp", + "list": "cpp", + "ratio": "cpp", + "future": "cpp", + "mutex": "cpp", + "semaphore": "cpp", + "shared_mutex": "cpp", + "span": "cpp", + "stop_token": "cpp", + "thread": "cpp", + "cfenv": "cpp", + "variant": "cpp", + "format": "cpp", + "any": "cpp", + "source_location": "cpp", + "run_inference_particle_net.C": "cpp", + "test.C": "cpp" + } +} diff --git a/README.md b/README.md index 97902f8..597cb56 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,11 @@ source setup.sh ``` Now ROOT should also access the SOFIE libraries while it runs. 
This helps to accelerate development. Submit your developments here and we will proceed with the developments in ROOT carefull. - +3. To enable testing generated code with alpaka implementations, build using the following command: +```bash +cmake -Dtesting=ON -DENABLE_ALPAKA_TESTS=ON -DCMAKE_INSTALL_PREFIX=../install -DCMAKE_BUILD_TYPE=RelWithDebInfo .. +``` +The default architecture is CUDA, but can be configured using an additional `-DALPAKA_BACKEND=hip` cmake option. ## Inspiration The standalone version of SOFIE is developed with inspiration from the standalone version of RooFit developed by Jonas Rembser that can be found [here](https://github.com/guitargeek/roofit). diff --git a/src/.vscode/settings.json b/src/.vscode/settings.json new file mode 100644 index 0000000..8bc121a --- /dev/null +++ b/src/.vscode/settings.json @@ -0,0 +1,61 @@ +{ + "files.associations": { + "*.icc": "cpp", + "iostream": "cpp", + "ostream": "cpp", + "cctype": "cpp", + "clocale": "cpp", + "cmath": "cpp", + "cstdarg": "cpp", + "cstddef": "cpp", + "cstdio": "cpp", + "cstdlib": "cpp", + "cstring": "cpp", + "ctime": "cpp", + "cwchar": "cpp", + "cwctype": "cpp", + "array": "cpp", + "atomic": "cpp", + "bit": "cpp", + "bitset": "cpp", + "compare": "cpp", + "complex": "cpp", + "concepts": "cpp", + "cstdint": "cpp", + "deque": "cpp", + "map": "cpp", + "set": "cpp", + "string": "cpp", + "unordered_map": "cpp", + "unordered_set": "cpp", + "vector": "cpp", + "exception": "cpp", + "algorithm": "cpp", + "functional": "cpp", + "iterator": "cpp", + "memory": "cpp", + "memory_resource": "cpp", + "numeric": "cpp", + "optional": "cpp", + "random": "cpp", + "regex": "cpp", + "string_view": "cpp", + "system_error": "cpp", + "tuple": "cpp", + "type_traits": "cpp", + "utility": "cpp", + "fstream": "cpp", + "initializer_list": "cpp", + "iomanip": "cpp", + "iosfwd": "cpp", + "istream": "cpp", + "limits": "cpp", + "new": "cpp", + "numbers": "cpp", + "sstream": "cpp", + "stdexcept": "cpp", + "streambuf": "cpp", 
+ "cinttypes": "cpp", + "typeinfo": "cpp" + } +} \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c48e8d1..102ca3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,3 +8,4 @@ set(sofie_legacy_eval_backend ON CACHE BOOL "" FORCE) add_subdirectory(SOFIE_core) add_subdirectory(SOFIE_parsers) +add_subdirectory(utils) diff --git a/src/SOFIE_core/CMakeLists.txt b/src/SOFIE_core/CMakeLists.txt index 7297957..4cab8e0 100644 --- a/src/SOFIE_core/CMakeLists.txt +++ b/src/SOFIE_core/CMakeLists.txt @@ -76,6 +76,7 @@ list(TRANSFORM sources_headers PREPEND "inc/") set(sources_cxx src/RModel_Base.cxx src/RModel.cxx + src/RModel_ALPAKA.cxx src/RModel_GNN.cxx src/RModel_GraphIndependent.cxx src/RFunction.cxx @@ -87,18 +88,24 @@ set(sources_cxx target_sources(SOFIE_core PRIVATE ${sources_headers} ${sources_cxx}) target_include_directories(SOFIE_core PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/inc) +target_link_libraries(SOFIE_core PUBLIC utils) target_link_libraries(SOFIE_core PUBLIC Tree Core RIO ) -ROOT_GENERATE_DICTIONARY(G__SOFIE ${sources_headers} +ROOT_GENERATE_DICTIONARY(G__SOFIE_core ${sources_headers} LINKDEF inc/LinkDef.h MODULE SOFIE_core OPTIONS --deep ) +# Install the dictionaries. +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_core.rootmap + DESTINATION lib) + install(TARGETS SOFIE_core LIBRARY DESTINATION lib ) diff --git a/src/SOFIE_core/README.md b/src/SOFIE_core/README.md index 033cad4..2259d7a 100644 --- a/src/SOFIE_core/README.md +++ b/src/SOFIE_core/README.md @@ -25,7 +25,6 @@ SOFIE works in a parser-generator working architecture. 
With SOFIE, the user get From ROOT command line, or in a ROOT macro, we can proceed with an ONNX model: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; SOFIE::RModel model = parser.Parse(“./example_model.onnx”); model.Generate(); @@ -73,7 +72,6 @@ SOFIE also supports generating inference code with RDataFrame as inputs, refer t Here is the updated list of supported ONNX operators. You can obtain this list by doing ```cpp -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; std::vector supportedOperators = parser.GetRegisteredOperators(); ``` @@ -164,7 +162,6 @@ The above operators are supported for tensors of the following types: You can also check your model whether all operators are implemented by doing the following: ```c++ -using namespace TMVA::Experimental; SOFIE::RModelParser_ONNX parser; parser.CheckModel("example_model.ONNX"); ``` diff --git a/src/SOFIE_core/inc/SOFIE/RFunction.hxx b/src/SOFIE_core/inc/SOFIE/RFunction.hxx index 53c30e3..f79691a 100644 --- a/src/SOFIE_core/inc/SOFIE/RFunction.hxx +++ b/src/SOFIE_core/inc/SOFIE/RFunction.hxx @@ -3,6 +3,7 @@ #include "SOFIE/RModel_Base.hxx" #include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/RModel.hxx b/src/SOFIE_core/inc/SOFIE/RModel.hxx index 79541af..50fc231 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel.hxx +++ b/src/SOFIE_core/inc/SOFIE/RModel.hxx @@ -16,14 +16,21 @@ private: int fVerbose = 0; int fBatchSize = -1; long fReadPos = 0; // reading file position + size_t fConstantTensorSize = 0; // size (in Bytes) of the allocated constant tensors + size_t fWeightsTensorSize = 0; // size (in Bytes) of the allocated weight tensors + size_t fOtherTensorSize = 0; // size (in Bytes) of intermediate tensors which are not managed by the memory pool + + OptimizationLevel fOptimizationLevel = OptimizationLevel::kExtended; std::unordered_map fInputTensorInfos; // input tensors where shape may not fully 
defined or other graph inputs? std::unordered_map fReadyInputTensorInfos; // input tensors where shape is full defined std::unordered_map fInitializedTensors; std::unordered_map fIntermediateTensorInfos; std::unordered_map fDynamicTensorInfos; + std::unordered_map, bool>> fShapeTensors; // constant tensors describing a shape std::unordered_map fShapeParams; // parameters defining the dynamic shape (e.g. batch size), store also its default value + std::vector fDimShapeNames; // parameter names used to define the shapes std::vector fOutputTensorNames; std::vector fInputTensorNames; // input tensor names using ONNX order @@ -58,9 +65,14 @@ public: int Verbose() const { return fVerbose;} - const std::vector &GetTensorShape(std::string name) const; - std::vector GetDynamicTensorShape(std::string name) const; - const ETensorType &GetTensorType(std::string name) const; + const std::vector &GetTensorShape(const std::string & name) const; + std::vector GetDimTensorShape(const std::string & name) const; + const ETensorType &GetTensorType(const std::string & name) const; + std::vector GetDynamicTensorShape(const std::string & name) const ; + + // get the values for the tensor representing a shape + const std::vector & GetShapeTensorValues(const std::string & tensor_name) const; + bool CheckIfTensorAlreadyExist(std::string tensor_name); void AddInputTensorInfo(std::string input_name, ETensorType type, std::vector shape); @@ -81,6 +93,7 @@ public: size_t length = ConvertShapeToLength(shape); std::shared_ptr data_ptr(malloc(length * sizeof(T)), free); std::memcpy(data_ptr.get(), (void*) data, length * sizeof(T)); + std::cout<<"Length of constant tensor "<(T()), shape, data_ptr); } // for boolean can be more convenient passing an std::vector @@ -102,6 +115,8 @@ public: AddInitializedTensor(tensor_name, GetTemplatedType(T()), shape, data); } + void AddShapeTensor(const std::string & name, const std::vector & shapeValues, bool scalar = false); + // add and initialize subgraph to 
the model void InitializeSubGraph(std::shared_ptr graph); @@ -118,13 +133,15 @@ public: bool IsDimInputTensor(const std::string &name) const; // check if tensor is a fully specified input tensor bool IsReadyInputTensor(const std::string &name) const; + /// check if a tensor is a shape tensor + bool IsShapeTensor(const std::string & name) const; // Add intermediate tensor void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector dim_shape); void AddIntermediateTensor(std::string tensor_name, ETensorType type, std::vector shape); // Add an intermediate dynamic tensor void AddDynamicTensor(std::string tensor_name, ETensorType type, std::vector shape); - + void AddShapeParam(const std::string & name, size_t def_value = 0); void AddInputTensorName(std::string name); void AddOutputTensorNameList(std::vector output_tensor_names); void @@ -132,6 +149,9 @@ public: void UpdateInitializedTensor(std::string tensor_name, ETensorType type, std::vector shape, std::shared_ptr data); std::shared_ptr GetInitializedTensorData(std::string tensor_name); + void RemoveInitializedTensor(std::string tensor_name); + template + std::vector GetTensorData(const std::string & name); void Initialize(int batchSize = -1, bool verbose = false); void Initialize(const std::map & inputParams, bool verbose = false); @@ -141,34 +161,64 @@ public: { Generate(static_cast>(options), batchSize, pos, verbose); } + void GenerateGPU_ALPAKA(std::underlying_type_t options, int batchSize = -1, bool verbose = false); + void GenerateGPU_ALPAKA(Options options = Options::kDefault, int batchSize = -1, bool verbose = false) + { + GenerateGPU_ALPAKA(static_cast>(options), batchSize, verbose); + } // generate the infer function signature. If isdecl= false generate the calling infer function // used to infer the sub-graphs std::string GenerateInferSignature(bool isdecl = true); + // generate the infer function signature for inference on ALPAKA. 
If isdecl= false generate the calling infer function + // used to infer the sub-graphs + std::string GenerateInferSignature_GPU_ALPAKA(bool isdecl = true); + + void RemoveIntermediateTensor(const std::string& tensor_name){ + fIntermediateTensorInfos.erase(tensor_name); + } + // calculate total intermediate memory and position intermediate tensor addresses - std::string AllocateIntermediateMemory(std::span op_output_tensors); - void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); + std::string AllocateIntermediateMemory(std::span op_output_tensors); + void CheckAndFlushIntermediateMemory(std::span op_output_tensors, const size_t& op_idx); protected: // internal functions // generate code for the initialized tensors void GenerateInitializedTensorInfo(); + + void GenerateInitializedTensorInfo_GPU_ALPAKA(); // generate code for the intermediate tensors void GenerateIntermediateTensorInfo(); + + // generate code for the temporary initialized tensors containers + void GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + // generate code for the dynamic tensors void GenerateDynamicTensorInfo(); + + void GenerateDynamicTensorInfo_GPU_ALPAKA(); // generate code for declarations needed by operators void GenerateOperatorDeclarations(); // generate code for inference void GenerateOutput(); + + void GenerateOutput_GPU_ALPAKA(); + + void MoveInitializedTensorsToBuffers_ALPAKA(); // generate code for initializing memory pool for intermediate tensors void GenerateIntermediateMemoryPool(); // Generate all session code void GenerateSessionCode(); + void GenerateSessionCode_GPU_ALPAKA(); + void GenerateGPU_ALPAKA_Buffers(); + + void CheckAndFuseOperators(); public: const std::vector &GetInputTensorNames() const { return fInputTensorNames; } const std::vector &GetOutputTensorNames() const { return fOutputTensorNames; } + const std::vector & GetDimShapeNames() const { return fDimShapeNames; } void ReadInitializedTensorsFromFile(long); long 
WriteInitializedTensorsToFile(std::string filename = ""); @@ -203,6 +253,21 @@ public: ClassDefNV(RModel, 3); }; +template +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsInitializedTensor(name)) return std::vector{}; + T * data = static_cast(GetInitializedTensorData(name).get()); + size_t size = ConvertShapeToLength(GetTensorShape(name)); + return std::vector(data, data+size); +} + +template<> +inline std::vector RModel::GetTensorData(const std::string & name) { + if (!IsShapeTensor(name)) return std::vector{}; + return GetShapeTensorValues(name); +} + + } // namespace SOFIE #endif // SOFIE_RMODEL diff --git a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx b/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx index f8a9d34..601e3a9 100644 --- a/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx +++ b/src/SOFIE_core/inc/SOFIE/RModel_Base.hxx @@ -12,7 +12,6 @@ #include #include #include "SOFIE/SOFIE_common.hxx" -#include "SOFIE/ROperator.hxx" #include "TBuffer.h" @@ -27,10 +26,26 @@ enum class Options { kGNNComponent = 0x10, }; +// Optimization levels inspired by ONNXRuntime. +// We only get Operator Fusion with the Basic, and +// memory reuse with Extended. 
kExtended is enabled +// by default +enum class OptimizationLevel { + kBasic = 0x0, + kExtended = 0x1, +}; + enum class WeightFileType { None, RootBinary, Text }; -std::underlying_type_t operator|(Options opA, Options opB); -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB); + +inline std::underlying_type_t operator|(Options opA, Options opB) { + return static_cast>(opA) | + static_cast>(opB); +} + +inline std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { + return opA | static_cast>(opB); +} class RModel_Base { @@ -53,6 +68,46 @@ protected: bool fIsGNN = false; bool fIsGNNComponent = false; + // Function to generate the code for declaring and initializing constant tensors + // This is for tensors which are not part of weight files and can be created from the Constant operator + template + std::string GenerateConstantTensorCode(const std::pair &t) + { + std::stringstream strs; + std::string type = ConvertTypeToString(t.second.type()); + size_t length = ConvertShapeToLength(t.second.shape()); + std::cout<<"Constant tensor name: "< 100) ? false : true; + + const T *data = t.second.data(); + + // and check if all values are the same + bool sameData = false; + // for non stack allocation check if data are the same + if (!allocateOnStack && length > 1) { + size_t idx = 1; + std::cout<<"insider allocate on stack and length\n"; + do { + std::cout<<"Printing idx: "< fTensor_" << t.first << " = "; + if (sameData) + strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; + else { + strs << ConvertValuesToString(length, data) << ";\n"; + } + strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; + } + return strs.str(); + } + public: /** Default constructor. Needed to allow serialization of ROOT objects. 
See @@ -82,6 +137,7 @@ public: fCustomOpHeaders.insert(filename); } void GenerateHeaderInfo(std::string &hgname); + void GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname); void PrintGenerated() { std::cout << fGC; } std::string ReturnGenerated() { return fGC; } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator.hxx b/src/SOFIE_core/inc/SOFIE/ROperator.hxx index edbec58..6c9a812 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator.hxx @@ -2,63 +2,113 @@ #define SOFIE_ROPERATOR #include +#include #include #include "SOFIE/SOFIE_common.hxx" -//#include "RModel.hxx" - - namespace SOFIE{ class RModel; +enum class OperatorKind { + GEMM = 0, + LAYERNORM = 1, + RELU = 2, + CONSTANT = 3, + CONSTANTOFSHAPE = 4, + UNDEFINED = 5, + CONV=6, + BATCHNORM=7, + CAST=8, + COMPARISON=9, + EINSUM=10, + ELU=11, + SIGMOID=12, + TANH=13, + SOFTMAX=14, + LEAKYRELU=15, +}; + +inline const char* toString(OperatorKind kind) { + switch (kind) { + case OperatorKind::GEMM: return "GEMM"; + case OperatorKind::LAYERNORM: return "LAYERNORM"; + case OperatorKind::RELU: return "RELU"; + case OperatorKind::CONSTANT: return "CONSTANT"; + case OperatorKind::CONSTANTOFSHAPE: return "CONSTANTOFSHAPE"; + case OperatorKind::BATCHNORM: return "BATCHNORM"; + case OperatorKind::CONV: return "CONV"; + case OperatorKind::UNDEFINED: return "UNDEFINED"; + default: return "UNKNOWN"; + } +} + +inline std::set FusableKinds = { OperatorKind::RELU, OperatorKind::LAYERNORM, OperatorKind::BATCHNORM}; + class ROperator{ public: virtual std::vector GetBlasRoutines() { return {}; } virtual std::vector GetStdLibs() { return {}; } - virtual std::vector> ShapeInference(std::vector>) = 0; - virtual std::vector TypeInference(std::vector) = 0; + virtual std::vector> ShapeInference(std::vector>) { return {}; }; + virtual std::vector TypeInference(std::vector) { return {}; }; virtual void Initialize(RModel&) = 0; virtual std::string Generate(std::string OpName) = 0; //expect unique opName for 
each operator within the same RModel + virtual std::string Generate_GPU_ALPAKA(std::string OpName){ return "";} //expect unique opName for each operator within the same RModel // generate initialization code for session constructor virtual std::string GenerateInitCode() { return "";} + virtual std::string GenerateInitCode_GPU_ALPAKA() { return "";}; // generate some specific declaration code for Session virtual std::string GenerateDeclCode() { return "";} // generate session data members specific to operator virtual std::string GenerateSessionMembersCode(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { return ""; } + virtual std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) { return ""; } virtual std::string Header() { return "";} + virtual std::string GetFusableOutputTensorName() { return "";} + virtual std::string GetBlasConfig() { return ""; } + virtual void UpdateFusableTensorName(std::string, const std::function& removal_func){ return;}; //virtual void Forward_reference() = 0; //virtual void Forward_blas() = 0; virtual ~ROperator(){} protected: - + OperatorKind fKind = OperatorKind::UNDEFINED; + size_t fOpOrder = 0; const std::string SP = " "; ///< space used to correctly indent the generated C++ code bool fUseSession = false; ///< flag to identify if using the session class bool fIsOutputConstant = false; ///< flag to identify if operator has a constant output (no need to generate code) - - mutable std::vector fInputTensorNames; - mutable std::vector fOutputTensorNames; + bool fIsOutputParamShape = false; ///< flag to identify of the output represents a parametric shape (can be knwon at compile time) + + mutable std::vector fInputTensorNames; + mutable std::vector fOutputTensorNames; public: - std::span GetOpInputTensors() const { + std::span GetOpInputTensors() const { return fInputTensorNames; } - std::span GetOpOutputTensors() const { + std::span GetOpOutputTensors() 
const { return fOutputTensorNames; } - + + OperatorKind GetKind() const { return fKind; } + + void RegisterOperatorOrder(const size_t ord){ + fOpOrder = ord; + } + size_t GetOpOrder(){ + return fOpOrder; + } + }; }//SOFIE - #endif //SOFIE_OPERATOR diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx index 127eaff..85953d5 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BasicBinary.hxx @@ -1,5 +1,5 @@ -#ifndef SOFIE_ROperator_BasicBinary -#define SOFIE_ROperator_BasicBinary +#ifndef SOFIE_SOFIE_ROperator_BasicBinary +#define SOFIE_SOFIE_ROperator_BasicBinary #include "SOFIE/SOFIE_common.hxx" #include "SOFIE/ROperator.hxx" @@ -7,9 +7,15 @@ #include -namespace SOFIE{ +namespace SOFIE { -enum EBasicBinaryOperator { Add, Sub, Mul, Div, Pow }; +enum EBasicBinaryOperator { + Add, + Sub, + Mul, + Div, + Pow +}; template struct BinaryOperatorTrait {}; @@ -17,42 +23,42 @@ struct BinaryOperatorTrait {}; template struct BinaryOperatorTrait { static const std::string Name() { return "Add"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " + " + t2; } - static T Func(T t1, T t2) {return t1 + t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " + " + t2; } + static T Func(T t1, T t2) { return t1 + t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Sub"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " - " + t2; } - static T Func (T t1, T t2) { return t1 - t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " - " + t2; } + static T Func(T t1, T t2) { return t1 - t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Mul"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " * " + t2; } - static T Func (T t1, T 
t2) { return t1 * t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " * " + t2; } + static T Func(T t1, T t2) { return t1 * t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Div"; } - static std::string Op(const std::string & t1, const std::string t2) { return t1 + " / " + t2; } - static T Func (T t1, T t2) { return t1/t2;} + static std::string Op(const std::string &t1, const std::string t2) { return t1 + " / " + t2; } + static T Func(T t1, T t2) { return t1 / t2; } }; template struct BinaryOperatorTrait { static const std::string Name() { return "Pow"; } - static std::string Op(const std::string & t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } - static T Func (T t1, T t2) { return std::pow(t1,t2);} + static std::string Op(const std::string &t1, const std::string t2) { return "std::pow(" + t1 + "," + t2 + ")"; } + static T Func(T t1, T t2) { return std::pow(t1, t2); } }; -template -class ROperator_BasicBinary final : public ROperator{ +template +class ROperator_BasicBinary final : public ROperator { private: - + int fBroadcastFlag = 0; std::string fNA; std::string fNB; std::string fNBroadcastedA; @@ -63,154 +69,444 @@ private: std::vector fShapeB; std::vector fShapeY; + std::vector fDimShapeA; + std::vector fDimShapeB; + std::vector fDimShapeY; + public: - ROperator_BasicBinary(){} - ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY): - fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNA, fNB }; - fOutputTensorNames = { fNY }; - } + ROperator_BasicBinary() {} + ROperator_BasicBinary(std::string nameA, std::string nameB, std::string nameY) + : fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = {fNA, fNB}; + fOutputTensorNames = {fNY}; + } // type of output given input - std::vector 
TypeInference(std::vector input) override { - return input; - } + std::vector TypeInference(std::vector input) override { return input; } // shape of output tensors given input tensors - std::vector> ShapeInference(std::vector> input) override { + std::vector> ShapeInference(std::vector> input) override + { // assume now inputs have same shape (no broadcasting) auto ret = std::vector>(1, input[0]); // return vector size 1 with first input return ret; } - void Initialize(RModel& model) override { + void Initialize(RModel &model) override + { // input must be a graph input, or already initialized intermediate tensor - if (!model.CheckIfTensorAlreadyExist(fNA)){ + if (!model.CheckIfTensorAlreadyExist(fNA)) { throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNA + "is not found in model"); } if (!model.CheckIfTensorAlreadyExist(fNB)) { throw std::runtime_error(std::string("TMVA SOFIE Binary Op Input Tensor ") + fNB + "is not found in model"); } - fShapeA = model.GetTensorShape(fNA); - fShapeB = model.GetTensorShape(fNB); - bool broadcast = !UTILITY::AreSameShape(fShapeA, fShapeB); - if (broadcast) { - // Y is the common shape of A and B - fShapeY = UTILITY::UnidirectionalBroadcastShape(fShapeA, fShapeB); - bool broadcastA = !UTILITY::AreSameShape(fShapeA, fShapeY); - bool broadcastB = !UTILITY::AreSameShape(fShapeB, fShapeY); - // Broadcast A to Y - if (broadcastA) { - fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; - if (model.IsInitializedTensor(fNA)) { - auto data = model.GetInitializedTensorData(fNA); - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), - std::default_delete()); - // Update the data and the shape of A - model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); - fShapeA = fShapeY; + int dynamicInputs = 0; + if (model.IsDynamicTensor(fNA)) { + fDimShapeA = model.GetDimTensorShape(fNA); + dynamicInputs |= 1; + } else { + 
fShapeA = model.GetTensorShape(fNA); + fDimShapeA = ConvertShapeToDim(fShapeA); + } + if (model.IsDynamicTensor(fNB)) { + dynamicInputs |= 2; + fDimShapeB = model.GetDimTensorShape(fNB); + } else { + fShapeB = model.GetTensorShape(fNB); + fDimShapeB = ConvertShapeToDim(fShapeB); + } + if (dynamicInputs & 1 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNA << " is dynamic " + << ConvertDimShapeToString(fDimShapeA) << " "; + if (dynamicInputs & 2 && model.Verbose()) + std::cout << BinaryOperatorTrait::Name() << " : input " << fNB << " is dynamic " + << ConvertDimShapeToString(fDimShapeB) << " "; + std::cout << std::endl; + // check if need to broadcast at initialization time if shapes are known and different + // (we could broadcast the tensor tensor to maximum values of dynamic shapes - to be done) + // case of known shapes + // if shapes are known find the output shape from broadcasting + if (dynamicInputs == 0) { + auto ret = UTILITY::MultidirectionalBroadcastShape(fShapeA, fShapeB); + fBroadcastFlag = ret.first; + fShapeY = ret.second; + if (model.IsConstantTensor(fNA) && model.IsConstantTensor(fNB)) { + bool broadcast = fBroadcastFlag > 0; + if (broadcast) { + // Y is the common shape of A and B + bool broadcastA = fBroadcastFlag & 2; + bool broadcastB = fBroadcastFlag & 1; + // Broadcast A to Y + if (broadcastA) { + fNBroadcastedA = "Broadcasted" + fNA + "to" + fNY; + auto data = model.GetInitializedTensorData(fNA); + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), + std::default_delete()); + if (model.Verbose()) + std::cout << "broadcasted data A " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + // Update the data and the shape of A + model.AddConstantTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY, broadcastedData); + fShapeA = fShapeY; + 
fDimShapeA = ConvertShapeToDim(fShapeA); + } + // Broadcast B to Y + if (broadcastB) { + fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; + auto data = model.GetInitializedTensorData(fNB); + if (model.Verbose()) + std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) + << std::endl; + std::shared_ptr broadcastedData( + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), + std::default_delete()); + // do not update tensor B but add broadcasted one (since it can be input to some other operators) + if (model.Verbose()) + std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " + << ConvertValuesToString(ConvertShapeToLength(fShapeY), + static_cast(broadcastedData.get())) + << std::endl; + model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); + fShapeB = fShapeY; + fDimShapeB = ConvertShapeToDim(fShapeB); + } } else { - // Add an intermediate tensor for broadcasting A - model.AddIntermediateTensor(fNBroadcastedA, model.GetTensorType(fNA), fShapeY); + fShapeY = fShapeA; } - } - // Broadcast B to Y - if (broadcastB) { - fNBroadcastedB = "Broadcasted" + fNB + "to" + fNY; - if (model.IsInitializedTensor(fNB)) { - auto data = model.GetInitializedTensorData(fNB); - std::cout << "data B " << ConvertShapeToString(fShapeB) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeB), static_cast(data.get())) << std::endl; - std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), - std::default_delete()); - // do not update tensor B but add broadcasted one (since it can be input to some other operators) - std::cout << "broadcasted data B " << ConvertShapeToString(fShapeY) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeY), static_cast(broadcastedData.get())) << std::endl; - model.AddConstantTensor(fNBroadcastedB, 
model.GetTensorType(fNB), fShapeY, broadcastedData); - fShapeB = fShapeY; - } else { - // Add an intermediate tensor for broadcasting B - model.AddIntermediateTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY); + // tensors are constant: perform here the binary operation + + const std::string &nameA = fNBroadcastedA.empty() ? fNA : fNBroadcastedA; + const std::string &nameB = fNBroadcastedB.empty() ? fNB : fNBroadcastedB; + auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + std::vector dataY(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < dataY.size(); i++) { + dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); + } + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + // flag tensors to not be written in the weight file + model.SetNotWritableInitializedTensor(nameA); + model.SetNotWritableInitializedTensor(nameB); + fIsOutputConstant = true; + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << " : " << ConvertValuesToString(dataY) << std::endl; + } + } else { + // case of defined and non-constant tensors + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << fNA << " " << ConvertShapeToString(fShapeA) + << " , " << fNB << " " << ConvertShapeToString(fShapeB) << " ---> " << fNY << " " + << ConvertShapeToString(fShapeY) << std::endl; } + // we convert non-dim shapes to Dim shapes + fDimShapeY = ConvertShapeToDim(fShapeY); } } else { - fShapeY = fShapeA; - } - // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB)) { - const std::string& nameA = fNBroadcastedA.empty()? 
fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) { - dataY[i] = BinaryOperatorTrait::Func(dataA[i], dataB[i]); + // case A or B have dynamic shapes. We need to broadcast if shape are not same + auto ret = UTILITY::MultidirectionalBroadcastShape(fDimShapeA, fDimShapeB); + fBroadcastFlag = ret.first; + fDimShapeY = ret.second; + // case of all parametric shapes and MultiDirectionalBroadcastShape return the max of the 2 + // need to do before we declare the output tensor shape and the broadcasted ones + if (ret.first & 4) { + // check if one of the parameter is an input dimension + // define function to find this + auto IsInputDimParam = [&](const std::string &p) { + auto inputNames = model.GetInputTensorNames(); + for (auto &input : inputNames) { + for (auto &i_s : model.GetDimTensorShape(input)) { + if (i_s.isParam && i_s.param == p) + return true; + } + } + return false; + }; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + auto &s = fDimShapeY[i]; + if (s.isParam && s.param.find("std::max") != std::string::npos) { + if (IsInputDimParam(fDimShapeA[i].param)) { + // case dim is 1 we indicate that the input parameter is equal to 1 + if (fDimShapeA[i].dim != 1) + s = fDimShapeA[i]; + else + s = fDimShapeB[i]; + } else if (IsInputDimParam(fDimShapeB[i].param)) { + if (fDimShapeB[i].dim != 1) + s = fDimShapeB[i]; + else + s = fDimShapeA[i]; + } + } + } + } + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fDimShapeY); + if (model.Verbose()) { + std::cout << BinaryOperatorTrait::Name() << " : " << ConvertDimShapeToString(fDimShapeA) << " , " + << ConvertDimShapeToString(fDimShapeB) << " --> " << ConvertDimShapeToString(fDimShapeY) << std::endl; } - 
model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a fil - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); - fIsOutputConstant = true; - if (model.Verbose()) - std::cout << "Binary op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - } - else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); } } - std::string GenerateInitCode() override { + std::string GenerateInitCode() override + { std::stringstream out; return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override + { - if (fIsOutputConstant) return ""; + if (fIsOutputConstant) + return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; - if (fShapeY.empty()) { + if (fDimShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Binary Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//------ " << BinaryOperatorTrait::Name() << "\n"; - size_t length = ConvertShapeToLength(fShapeY); + out << SP << "\n//------ " << opName << " " << BinaryOperatorTrait::Name() << " --> " + << ConvertDimShapeToString(fDimShapeY) << "\n"; + auto length = ConvertDimShapeToLength(fDimShapeY); std::string typeName = TensorType::Name(); - // Broadcast A if it's uninitialized - // use broadcasting function where we pass an already allocated tensor to minimize memory allocations - if (fShapeA != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; - } - // Broadcast B if it's uninitialized - if (fShapeB != fShapeY) { - out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; - out << SP << 
"SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; - } - const std::string& nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - const std::string& nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; - out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = " << BinaryOperatorTrait::Op( "tensor_" + nameA + "[id]" , "tensor_" + nameB + "[id]") << " ;\n"; - out << SP << "}\n"; + + // we need to check if we can broadcast (case flag has bit 4 set) + + if (fBroadcastFlag & 4) { + // need to check if shapes are the same + auto lengthA = ConvertDimShapeToLength(fDimShapeA); + auto lengthB = ConvertDimShapeToLength(fDimShapeB); + out << SP << "if (" << lengthA << "!=" << lengthB << ") {\n"; + // check if A->B or B->A + // bool broadcastable = true; + for (size_t i = 0; i < fDimShapeY.size(); i++) { + if (fBroadcastFlag & 5 && fDimShapeY[i] == fDimShapeA[i] && fDimShapeA[i].dim > 1 && + fDimShapeB[i].isParam) { + // B->A B[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeB[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast B->A in operator " + << opName << "\");\n"; + } + if (fBroadcastFlag & 6 && fDimShapeY[i] == fDimShapeB[i] && fDimShapeB[i].dim > 1 && + fDimShapeA[i].isParam) { + // A-> B A[i] needs to be 1 + out << SP << SP << "if (" << fDimShapeA[i] << "!= 1)\n"; + out << SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast A->B in operator " + << opName << "\");\n"; + } else if (fDimShapeA[i].isParam && fDimShapeB[i].isParam) { + // both shapes are parametric and we broadcast to maximum + // we allocate here output vector + out << SP << SP << "if (" << fDimShapeA[i] << " != " << fDimShapeB[i] << " && (" << fDimShapeA[i] + << " != 1 || " << fDimShapeB[i] << " != 1))\n"; + out << 
SP << SP << SP << "throw std::runtime_error(\"SOFIE - Cannot broadcast shapes in operator " << opName + << "\");\n"; + } + } + out << SP << "}\n"; + } + + auto stridesA = UTILITY::ComputeStrideFromShape(fDimShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fDimShapeB); + auto stridesY = UTILITY::ComputeStrideFromShape(fDimShapeY); + + std::string compute_idx_A, compute_idx_B, compute_idx_Y; + if (fDimShapeA.empty() || + std::all_of(fDimShapeA.begin(), fDimShapeA.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_A = "0"; + } else { + for (size_t i = 0; i < fDimShapeA.size(); ++i) { + if (fDimShapeA[i].dim == 1 || fDimShapeA[i].GetVal() == "1") + continue; + compute_idx_A += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeA.size())); + if (stridesA[i].GetVal() != "1") + compute_idx_A += " * " + stridesA[i].GetVal(); + compute_idx_A += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_A.pop_back(); + } + if (fDimShapeB.empty() || + std::all_of(fDimShapeB.begin(), fDimShapeB.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_B = "0"; + } else { + for (size_t i = 0; i < fDimShapeB.size(); ++i) { + if (fDimShapeB[i].dim == 1 || fDimShapeB[i].GetVal() == "1") + continue; + compute_idx_B += "idx_" + std::to_string(i + (fDimShapeY.size() - fDimShapeB.size())); + if (stridesB[i].GetVal() != "1") + compute_idx_B += " * " + stridesB[i].GetVal(); + compute_idx_B += " + "; + } + // remove last 3 character " + " + for (int j = 0; j < 3; j++) + compute_idx_B.pop_back(); + } + int nloop = 0; + if (fDimShapeY.empty() || + std::all_of(fDimShapeY.begin(), fDimShapeY.end(), [](Dim d) { return d.dim == 1 || d.GetVal() == "1"; })) { + compute_idx_Y = "0"; + } else { + for (size_t i = 0; i < fDimShapeY.size(); ++i) { + if (fDimShapeY[i].dim != 1 && fDimShapeY[i].GetVal() != "1") { + nloop++; + for (int j = 0; j < nloop; j++) out << SP; + out << "for (size_t idx_" << i << " = 
0; idx_" << i << " < " << fDimShapeY[i] + << "; ++idx_" << i << "){\n"; + compute_idx_Y += "idx_" + std::to_string(i); + if (stridesY[i].GetVal() != "1") + compute_idx_Y += " * " + stridesY[i].GetVal(); + compute_idx_Y += " + "; + } + } + // remove last 3 characters " + " + for (int j = 0; j < 3; j++) + compute_idx_Y.pop_back(); + } + for (int j = 0; j < nloop + 1; j++) out << SP; + out << "tensor_" << fNY << "[" << compute_idx_Y << "] = " + << BinaryOperatorTrait::Op("tensor_" + fNA + "[" + compute_idx_A + "]", + "tensor_" + fNB + "[" + compute_idx_B + "]") + << " ;\n"; + + for (int i = nloop; i > 0; i--) { + for (int j = 0; j < i; j++) out << SP; + out << "}\n"; + } return out.str(); } - std::vector GetStdLibs() override { + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) { + if (fIsOutputConstant) + return ""; + + std::string op; + op = "\n//------ "+opName+"_"+BinaryOperatorTrait::Name()+"_KERNEL_ALPAKA\n"; + op += SP + "struct Binary"+opName+BinaryOperatorTrait::Name()+"Kernel {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const * A, T const * B, T * C) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < " + ConvertShapeToLength(fShapeY) + ") {\n"; + auto stridesA = UTILITY::ComputeStrideFromShape(fShapeA); + auto stridesB = UTILITY::ComputeStrideFromShape(fShapeB); + + for(size_t id_s = 0; id_s < stridesA.size(); ++id_s){ + if(fShapeA[id_s] == 1) + stridesA[id_s] = 0; + } + + for(size_t id_s = 0; id_s < stridesB.size(); ++id_s){ + if(fShapeB[id_s] == 1) + stridesB[id_s] = 0; + } + + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + std::string flattened_index_A = ""; + std::string flattened_index_B = ""; + std::string temp = "idx"; + + op += "// stridesY " + ConvertShapeToString(stridesY) + "\n"; + op += "// stridesA " + ConvertShapeToString(stridesA) + "\n"; + op += "// stridesB " + ConvertShapeToString(stridesB) + "\n"; 
+ + for (size_t id_s = 0; id_s < fShapeA.size(); ++id_s) { + + auto strideY = stridesY[id_s]; + auto strideA = stridesA[id_s]; + + // coord expression + std::string coord = "(int)(" + temp + " / " + std::to_string(strideY) + ")"; + + // accumulate into final index + flattened_index_A += coord + " * " + std::to_string(strideA) + " + "; + + // update temp correctly + temp = temp + " - (" + coord + " * " + std::to_string(strideY) + ")"; + } + + // remove trailing " + " + if (!flattened_index_A.empty()) + flattened_index_A.erase(flattened_index_A.size() - 3); + + temp = "idx"; + + for (size_t id_s = 0; id_s < fShapeB.size(); ++id_s) { + + auto strideY = stridesY[id_s]; + auto strideB = stridesB[id_s]; + + // coord expression + std::string coord = "(int)(" + temp + " / " + std::to_string(strideY) + ")"; + + // accumulate into final index + flattened_index_B += coord + " * " + std::to_string(strideB) + " + "; + + // update temp correctly + temp = temp + " - (" + coord + " * " + std::to_string(strideY) + ")"; + } + + // remove trailing " + " + if (!flattened_index_B.empty()) + flattened_index_B.erase(flattened_index_B.size() - 3); + + + op += "C[idx] = " + BinaryOperatorTrait::Op("A["+flattened_index_A+"]", "B["+flattened_index_B+"]") + ";\n"; + op += "}\n}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + return SP + "Binary"+OpName+BinaryOperatorTrait::Name()+"Kernel binary" + OpName + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) { + if (fIsOutputConstant) + return ""; + + if (fDimShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Basic Binary called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fDimShapeY); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const 
kernelCfg_" << fNY << " = {elementsPerGrid_" << fNY << ", elementsPerThread_" << fNY << "};\n"; + out << SP << "auto const workDiv_" << fNY << " = alpaka::getValidWorkDiv(kernelCfg_" << fNY << ", devAcc, binary" << OpName << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNA + << "), alpaka::getPtrNative(deviceBuf_" << fNB << "), alpaka::getPtrNative(deviceBuf_" << fNY << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", binary" << OpName << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNA + << "), alpaka::getPtrNative(deviceBuf_" << fNB << "), alpaka::getPtrNative(deviceBuf_" << fNY << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + + std::vector GetStdLibs() override + { if (Op == EBasicBinaryOperator::Pow) { - return { std::string("cmath") }; + return {std::string("cmath")}; } else { return {}; } } -}; -}//SOFIE + +}; +} // namespace SOFIE -#endif //SOFIE_ROperator_BasicBinary +#endif // SOFIE_ROperator_BasicBinary diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx index c18c17e..b98ded5 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BasicUnary.hxx @@ -107,6 +107,33 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA() { + std::string op; + op = "\n//------ " + UnaryOpTraits::Name() + "_KERNEL_ALPAKA\n"; + op += SP + "struct Unary" + UnaryOpTraits::Name() + "Kernel{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T* data, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "data[i] = " << UnaryOpTraits::Op("data[i]") << ";\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "Unary" + UnaryOpTraits::Name() + 
"Kernel " + UnaryOpTraits::Name() + "Kernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + std::stringstream out; + auto length = ConvertShapeToLength(fShapeX); + out << "\n//------ "+OpName+"_ALPAKA\n"; + out << SP << "alpaka::WorkDivMembers workDiv_"<::all("<<(length+255)/256<<"), alpaka::Vec::all(256), alpaka::Vec::all(1));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX << ", " << UnaryOpTraits::Name() << "Kernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), static_cast(" << length << ")); \n"; + return out.str(); + } + std::vector GetStdLibs() override { if (Op == EBasicUnaryOperator::kSqrt || Op == EBasicUnaryOperator::kExp || Op == EBasicUnaryOperator::kLog) { return { std::string("cmath") }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx index a27cea4..1a6098d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_BatchNormalization.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_ROPERATOR_BatchNormalization #define SOFIE_ROPERATOR_BatchNormalization -#include "SOFIE_common.hxx" -#include "ROperator.hxx" -#include "RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx index 47c3d66..2cb797b 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Cast.hxx @@ -10,6 +10,14 @@ namespace SOFIE{ +template +std::vector convertToInt64(const In* src, size_t n) { + std::vector dst(n); + std::transform(src, src + n, dst.begin(), + [](In v) { return static_cast(v); }); + return dst; +} + class ROperator_Cast final : public ROperator { @@ -26,6 +34,7 @@ public: ROperator_Cast(std::string attr_type,std::string nameX, std::string nameY): fNX(UTILITY::Clean_name(nameX)), 
fNY(UTILITY::Clean_name(nameY)), fAttrType(attr_type) { + fKind = OperatorKind::CAST; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; } @@ -47,11 +56,67 @@ public: fShape = model.GetTensorShape(fNX); // shoud we add a check if the same type auto inputType = model.GetTensorType(fNX); + const size_t n = ConvertShapeToLength(fShape); if (model.IsInitializedTensor(fNX)) { fIsOutputConstant = true; auto inputData = model.GetInitializedTensorData(fNX); if (ConvertStringToType(fAttrType) == ETensorType::INT64) { - model.AddConstantTensor(fNY, fShape, static_cast(inputData.get())); + auto inputTypeStr = ConvertTypeToString(inputType); + if (inputTypeStr == "int32_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "float") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "double") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "int8_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "int16_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "uint8_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "uint16_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "uint32_t") { + auto* src = static_cast(inputData.get()); + 
auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "uint64_t") { + auto* src = static_cast(inputData.get()); + auto converted = convertToInt64(src, n); + model.AddConstantTensor(fNY, fShape, converted); + } + else if (inputTypeStr == "int64_t") { + model.AddConstantTensor( + fNY, fShape, + static_cast(inputData.get()) + ); + } + else { + throw std::runtime_error("Unsupported input type for INT64 conversion"); + } + model.SetNotWritableInitializedTensor(fNX); } else @@ -90,6 +155,45 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + std::string op; + op = "\n//------ CAST_KERNEL_ALPAKA\n"; + op += SP + "struct CastKernel"+opName+"{\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, SrcT const * src, DstT * dst, std::size_t numElements) const {\n"; + op += SP + SP + SP + "for (auto i : alpaka::uniformElements(acc, numElements)) {\n"; + op += SP + SP + SP + "dst[i] = static_cast(src[i]);\n"; + op += SP + SP + "}\n"; + op += SP + "}\n};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + return SP + "CastKernel"+opName+" castKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + if (fIsOutputConstant) return ""; + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Cast called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ CAST_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNY << " = {elementsPerGrid_" << fNY << ", elementsPerThread_" << fNY << "};\n"; + out << SP << "auto const workDiv_" << fNY << " = 
alpaka::getValidWorkDiv(kernelCfg_" << fNY << ", devAcc, castKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY << ", castKernel, alpaka::getPtrNative(deviceBuf_" << fNX << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << ")); \n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx index 7648a9a..a00ed28 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Comparision.hxx @@ -73,6 +73,7 @@ public: ROperator_Comparision(){} ROperator_Comparision(const std::string & nameX1, const std::string & nameX2, const std::string & nameY): fNX1(UTILITY::Clean_name(nameX1)), fNX2(UTILITY::Clean_name(nameX2)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::COMPARISON; fInputTensorNames = { fNX1, fNX2 }; // output will be a boolean vector so should not be considered for memory optimized pool diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx index 0d5e574..10d6d0d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Concat.hxx @@ -1,5 +1,5 @@ #ifndef SOFIE_ROPERATOR_Concat - #define SOFIE_ROPERATOR_Concat +#define SOFIE_ROPERATOR_Concat #include "SOFIE/SOFIE_common.hxx" @@ -23,8 +23,10 @@ std::string fOutput; std::vectorfOutputShape; std::vector> fInputShapes; + ETensorType fInputType; public: + ROperator_Concat(){} ROperator_Concat(std::vector inputs, int axis, int newAxis, std::string output): fAxis(axis), fnewAxis(newAxis), fOutput(UTILITY::Clean_name(output)) { @@ -53,6 +55,7 @@ throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); int concat_dim=0; + // case of Concat (fNewAxis = 0) and not ConcatFromSequence 
if(fnewAxis == 0){ for (size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i - 1].size()) @@ -73,6 +76,7 @@ ret[0][fAxis] = concat_dim; } std::vector stack; + // case ConCatFromSequence if(fnewAxis == 1){ for(size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i-1].size() ) @@ -96,8 +100,8 @@ } // get shape of output given inputs. It is going to be called after initialized - std::vector> ShapeInference(const std::vector> & inputs) { - std::vector> ret(1); + std::vector ShapeInference(const std::vector> & inputs, const RModel & model) { + std::vector ret(inputs[0].size()); // treat negative axis case if (fAxis<0) { fAxis = inputs[0].size()+fAxis; @@ -105,31 +109,54 @@ if (fAxis < 0 || fAxis >= (int) inputs[0].size()) throw std::runtime_error("TMVA SOFIE Concat Op - invalid axis value "); - int concat_dim=0; + Dim concat_dim; if(fnewAxis == 0){ for (size_t i = 0; i < inputs.size(); i++) { if (i > 0 && inputs[i].size() != inputs[i - 1].size()) throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have different shapes " + fInputs[i] + " : " + - ConvertDynamicShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDynamicShapeToString(inputs[i - 1])); + ConvertDimShapeToString(inputs[i]) + " and " + fInputs[i-1] + " : " + ConvertDimShapeToString(inputs[i - 1])); for (size_t iaxis = 0; iaxis < inputs[i].size(); iaxis++) { if ((int)iaxis == fAxis) { - // support only non-params shape for the concatenation axis - if (inputs[i][iaxis].isParam) - throw std::runtime_error("TMVA SOFIE Concat Op - not supporting input param dimensions for concatenation axis. 
Input shape is " + - ConvertDynamicShapeToString(inputs[i])); - concat_dim += inputs[i][iaxis].dim; + // support both integer and params shape for the concatenation axis + if (concat_dim.param.empty() && concat_dim.dim == 0) + concat_dim = inputs[i][iaxis]; + else if (inputs[i][iaxis].isParam || concat_dim.isParam) { + concat_dim = + Dim{ concat_dim.GetVal() + std::string(" + ") + inputs[i][iaxis].GetVal(), + static_cast(-1)}; + } else { + concat_dim = Dim { concat_dim.dim + inputs[i][iaxis].dim }; + } + } + else if (i == 0) { + ret[iaxis] = inputs[i][iaxis]; } - // other dimensions must be the same - else if (i > 0 && inputs[i][iaxis].GetVal() != inputs[i - 1][iaxis].GetVal()) + else if ((!inputs[i][iaxis].isParam && !ret[iaxis].isParam) && (inputs[i][iaxis].dim != ret[iaxis].dim)) { throw std::runtime_error("TMVA SOFIE Concat Op - input tensors have wrong shapes " + - ConvertDynamicShapeToString(inputs[i]) + " and " + - ConvertDynamicShapeToString(inputs[i - 1])); + ConvertDimShapeToString(inputs[i]) + " and " + + ConvertDimShapeToString(inputs[i - 1])); + } + else if (!inputs[i][iaxis].isParam && ret[iaxis].isParam){ + // if shape is not parametric use it + ret[iaxis] = inputs[i][iaxis]; + } + else if (inputs[i][iaxis].isParam && ret[iaxis].isParam) { + // check which parameter is first in RModel list + auto & dimNames = model.GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), inputs[i][iaxis].param); + auto p2 = std::find(dimNames.begin(), dimNames.end(), ret[iaxis].param); + if (p1 < p2) ret[iaxis] = inputs[i][iaxis]; + } + } + // add parenthesis in case is an expression + if (concat_dim.isParam && concat_dim.dim == static_cast(-1)) + concat_dim = Dim{ std::string("(") + concat_dim.GetVal() + std::string(")"), concat_dim.dim }; } - // output shape - ret[0] = inputs[0]; - ret[0][fAxis].dim = concat_dim; + // output shape for concatenated axis + ret[fAxis] = concat_dim; + } // case of stacking (not supported yet) // here we need to check 
that input shapes are the same @@ -141,24 +168,31 @@ return ret; } - void Initialize(RModel& model) override { + void Initialize(RModel& model) override { for (auto &it : fInputs) { if (model.CheckIfTensorAlreadyExist(it) == false) { throw std::runtime_error("TMVA SOFIE Concat Op Input Tensor " + it + " is not found in model"); } - fInputShapes.push_back(model.GetDynamicTensorShape(it)); + fInputShapes.push_back(model.GetDimTensorShape(it)); } - fOutputShape = ShapeInference(fInputShapes)[0]; + fOutputShape = ShapeInference(fInputShapes, model); if (model.Verbose()) - std::cout << "Output of concat operator has shape " << ConvertDynamicShapeToString(fOutputShape) << std::endl; + std::cout << "Output of concat operator has shape " << ConvertDimShapeToString(fOutputShape) << std::endl; // check if concat has constant inputs , axis 0(concat contigous memory and type is integer) + bool isOutputShape = false; + fInputType = model.GetTensorType(fInputs[0]); if (model.GetTensorType(fInputs[0]) == ETensorType::INT64 && fAxis == 0) { fIsOutputConstant = true; + isOutputShape = true; + for ( auto & input : fInputs) { if (!model.IsInitializedTensor(input)) { fIsOutputConstant = false; - break; + if (!model.IsShapeTensor(input)) { + isOutputShape = false; + break; + } } } if (fIsOutputConstant) { @@ -171,32 +205,64 @@ size_t inputLength = ConvertShapeToLength(inputShape); std::copy(inputData, inputData + inputLength, outputData.begin() + offset ); offset += inputLength; - // data do not need to be written as a weight + // the data of the input tensor don't need to be written in the generated code and data file model.SetNotWritableInitializedTensor(input); } model.AddConstantTensor(fOutput, outputShape, outputData.data()); if (model.Verbose()) { std::cout << "output of Concat is a constant tensor " << ConvertShapeToString(outputShape) << " : " - << ConvertValuesToString(outputData) << std::endl; + << ConvertValuesToString(outputData) << " (constant)" << std::endl; } + } else if 
(isOutputShape) { + auto outputShape = ConvertShapeToInt(fOutputShape); // conversion must be possible + std::vector outputData(ConvertShapeToLength(outputShape)); + size_t offset = 0; + for ( auto & input : fInputs) { + std::vector inputData; + auto inputShape = model.GetTensorShape(input); // shape is not dynamic + size_t inputLength = ConvertShapeToLength(inputShape); // shape can be a scalar + if (model.IsShapeTensor(input)) { + inputData = model.GetShapeTensorValues(input); + } else if (model.IsInitializedTensor(input)) { + inputData.resize(inputLength); + auto intData = static_cast(model.GetInitializedTensorData(input).get()); + for (size_t i = 0; i < inputData.size(); i++) + inputData[i] = Dim{ static_cast(intData[i])}; + } + else { + // this should not happen + throw std::runtime_error("TMVA SOFIE Concat Operator- invalid input type for shape output type"); + } + std::copy(inputData.begin(), inputData.end(), outputData.begin() + offset ); + offset += inputLength; + } + // add output tensor + model.AddShapeTensor(fOutput,outputData, false); // cannot be a scalar + if (model.Verbose()) { + std::cout << "output of Concat is a shape tensor " << ConvertShapeToString(outputShape) << " : " + << ConvertDimShapeToString(outputData) << " (shape)" << std::endl; + } + fIsOutputConstant = true; } } if (!fIsOutputConstant) { model.AddIntermediateTensor(fOutput, model.GetTensorType(fInputs[0]), fOutputShape); if (model.Verbose()) { - std::cout << "Concat ---> " << fOutput << " " << ConvertDynamicShapeToString(fOutputShape) << std::endl; + std::cout << "Concat ---> " << fOutput << " " << ConvertDimShapeToString(fOutputShape) << std::endl; } } } - std::string Generate(std::string OpName) override { - if (fIsOutputConstant) return ""; - OpName = "op_"+OpName; + std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out<<"\n//--------- Concat " << opName << " --> " << fOutput << " " << ConvertDimShapeToString(fOutputShape) << 
"\n"; + + if (fIsOutputConstant) return out.str(); + if(fOutputShape.empty()){ throw std::runtime_error("TMVA SOFIE Concat called to Generate without being initialized first"); } - std::stringstream out; - out<<"\n//--------- Concat\n"; // special case when memory is contiguous bool hasShapeOnes = true; for(int i = 0; i 0) out << offset; offset += " + " + length; @@ -238,14 +304,14 @@ for (size_t j = 0; j < fInputs.size(); j++) { if (j>0) - out << SP << SP << SP << "idxOut += " << fInputShapes[j-1][fAxis].GetVal() << ";\n"; + out << SP << SP << SP << "idxOut += " << inStrides[j-1][fAxis-1].GetVal() << ";\n"; out << SP << SP << SP << "int idxIn" << j <<" = "; for (int k = 0; k < fAxis; k++) { if (k > 0) out << " + "; out << inStrides[j][k].GetVal() << "*i" << k; } out << ";\n"; - out << SP << SP << SP << "for (size_t iC = 0; iC < " << fInputShapes[j][fAxis].GetVal() << "; ++iC) {\n"; + out << SP << SP << SP << "for (size_t iC = 0; iC < " << inStrides[j][fAxis-1].GetVal() << "; ++iC) {\n"; out << SP << SP << SP << SP << "tensor_" << fOutput << "[idxOut+iC] = tensor_" << fInputs[j] << "[idxIn" << j << "+iC];\n"; out << SP << SP << SP << "}\n"; // concatenate the axis values @@ -257,7 +323,131 @@ return out.str(); } - }; + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fOutputShape.empty()) + throw std::runtime_error("TMVA SOFIE Operator Concat called to Generate without being initialized first"); + + const std::size_t D = fOutputShape.size(); + const std::size_t Nin = fInputs.size(); + + auto outStrides = UTILITY::ComputeStrideFromShape(fOutputShape); + + std::vector prefix(Nin); + prefix[0] = 0; + for (std::size_t k = 1; k < Nin; ++k) + prefix[k] = prefix[k - 1] + std::stoul(fInputShapes[k - 1][fAxis].GetVal()); + + std::vector> inStrides(Nin); + for (std::size_t k = 0; k < Nin; ++k) + inStrides[k] = UTILITY::ComputeStrideFromShape(fInputShapes[k]); + + std::string op; + op 
= "\n//------ CONCAT_KERNEL_ALPAKA\n"; + op += SP + "struct ConcatKernel_" + opName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "std::array inputs,\n"; + op += SP + SP + SP + "T* output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "std::size_t remaining;\n"; + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "remaining = elem_idx;\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string stride_val = outStrides[d].GetVal(); + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = remaining / " + stride_val + "u;\n"; + op += SP + SP + SP + SP + "remaining -= out_" + std::to_string(d) + + " * " + stride_val + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t chosen = 0;\n"; + for (std::size_t k = 0; k < Nin; ++k) { + std::size_t end_k = prefix[k] + std::stoul(fInputShapes[k][fAxis].GetVal()); + op += SP + SP + SP + SP + "chosen += static_cast(" + + std::to_string(end_k) + "u <= out_" + std::to_string(fAxis) + ");\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const output_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + SP + "out_" + std::to_string(d) + + " * " + outStrides[d].GetVal() + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t k = 0; k < Nin; ++k) { + op += SP + SP + SP + SP + SP + "(chosen == " + std::to_string(k) + "u) * (\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == static_cast(fAxis)) + ? ("(out_" + std::to_string(d) + " - " + std::to_string(prefix[k]) + "u)") + : ("out_" + std::to_string(d)); + op += SP + SP + SP + SP + SP + SP + coord + + " * " + inStrides[k][d].GetVal() + "u"; + op += (d + 1 < D) ? " +\n" : "\n"; + } + op += SP + SP + SP + SP + SP + ")"; + op += (k + 1 < Nin) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[output_idx] = inputs[chosen][input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + return SP + "ConcatKernel_" + opName + " concatKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fOutputShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Concat called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertDimShapeToLength(fOutputShape); + out << "\n//------ CONCAT_GPU_ALPAKA\n"; + switch (fInputType){ + case ETensorType::FLOAT: + out << SP << "std::array input_ptrs_" << OpName << " = {"; break; + case ETensorType::INT64: + out << SP << "std::array input_ptrs_" << OpName << " = {"; break; + default: + throw std::runtime_error("Data type for Concat operator is not yet supported."); + } + for(size_t i=0; i0) out << ", "; + out << "alpaka::getPtrNative(deviceBuf_" << fInputs[i] << ")"; + } + out << "};\n"; + + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << OpName << " = {elementsPerGrid_" << OpName << ", elementsPerThread_" << OpName << 
"};\n"; + out << SP << "auto const workDiv_" << OpName << " = alpaka::getValidWorkDiv(kernelCfg_" << OpName << ", devAcc, concatKernel_" << OpName << ", input_ptrs_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fOutput << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << OpName + << ", concatKernel_" << OpName << ", input_ptrs_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fOutput << "), static_cast(" << length << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + + }; }//SOFIE + #endif //SOFIE_ROPERATOR_CONCAT diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx index 0d08432..6590909 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Constant.hxx @@ -101,6 +101,11 @@ public: // no code to generate here. Tensor are defined in Session constructor return "//---------------------------------------\n"; } + + std::string Generate_GPU_ALPAKA(std::string /* OpName */) override { + // no code to generate here. 
Tensor are defined in Session constructor + return "//---------------------------------------\n"; + } }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx index 0467385..b9d917b 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_ConvTranspose.hxx @@ -1,9 +1,9 @@ #ifndef SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX #define SOFIE_SOFIE_ROPERATOR_CONVTRANSPOSE_HXX -#include -#include -#include +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" #include #include diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx index e9b555b..901bff8 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Einsum.hxx @@ -41,6 +41,7 @@ public: ROperator_Einsum(const std::string & equation, const std::vector & namesX, const std::string & nameY): fNInputs(namesX.size()), fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::EINSUM; for (size_t i = 0; i < namesX.size(); i++) fNInputs[i] = UTILITY::Clean_name(namesX[i]); diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx index 34e18a6..dcbfd68 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Elu.hxx @@ -27,6 +27,7 @@ public: ROperator_Elu(float alpha,std::string nameX, std::string nameY): falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::ELU; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx index c834a06..786556d 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Expand.hxx @@ -14,9 +14,10 @@ template class ROperator_Expand final : public ROperator{ 
private: - std::vector fShapeX; + std::vector fShapeX; std::vector fShape; - std::vector fShapeY; + std::vector fShapeY; + std::vector fShapeDim; std::string fNX; std::string fNShape; @@ -24,6 +25,8 @@ private: std::string fType; bool fInitialized = false; + bool fInitializedShape = false; + bool fInitBroadcast = false; public: ROperator_Expand(){} @@ -33,97 +36,318 @@ public: fOutputTensorNames = { fNY }; } - // type of output given input - std::vector TypeInference(std::vector input) override { - return input; - } - - std::vector> ShapeInference(std::vector> input) override { - return input; - } void Initialize(RModel& model) override { // input must be a graph input, or already initialized intermediate tensor if (!model.CheckIfTensorAlreadyExist(fNX)) { throw std::runtime_error("TMVA SOFIE Expand Op Input Tensor " + fNX + " is not found in model"); } - fShapeX = model.GetTensorShape(fNX); - if (!model.IsInitializedTensor(fNShape)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNShape + " is not initialized."); - } - int64_t *shapeData = + fShapeX = model.GetDimTensorShape(fNX); + if (model.IsInitializedTensor(fNShape)) { + fInitializedShape = true; + int64_t *shapeData = static_cast(model.GetInitializedTensorData(fNShape).get()); - fShape = model.GetTensorShape(fNShape); - if (fShape.size() != 1) { - throw std::runtime_error("TMVA::SOFIE - Expand operator shape must be a 1d tensor."); + fShape = model.GetTensorShape(fNShape); + if (fShape.size() != 1) { + throw std::runtime_error("TMVA::SOFIE - Expand operator shape must be a 1d tensor."); + } + size_t N = fShape[0]; + // what do we do if shapeData contains negative values? 
+ for (size_t i = 0; i < N; i++) { + if ( shapeData[i] < 0) + throw std::runtime_error("TMVA::SOFIE - Expand: invalid shape value " + std::to_string(shapeData[i])); + } + std::vector shape(shapeData, shapeData + N); + fShapeDim = ConvertShapeToDim(shape); + } else if (model.IsShapeTensor(fNShape)) { + // case input shape is a shape tensor + fShapeDim = model.GetShapeTensorValues(fNShape); + fInitializedShape = true; + } else { + // assume shape of input shape is known (size is 1) + auto shapeOfInputShape = model.GetTensorShape(fNShape); + fShapeDim.resize(shapeOfInputShape[0]); + for (size_t i = 0; i < fShapeDim.size(); i++) { + fShapeDim[i] = Dim{std::string("v_") + fNShape + "_" + std::to_string(i)}; + model.AddShapeParam(fShapeDim[i].param); + } } - size_t N = fShape[0]; - std::vector shape(shapeData, shapeData + N); // Y is the common shape of fShapeX and shape - fShapeY = SOFIE::UTILITY::UnidirectionalBroadcastShape( - fShapeX, shape); - fInitialized = model.IsInitializedTensor(fNX); - // Broadcast X to the common shape fShapeY - bool broadcast = !UTILITY::AreSameShape(fShapeX, fShapeY); - if (model.IsInitializedTensor(fNX)) { + auto ret = SOFIE::UTILITY::MultidirectionalBroadcastShape(fShapeX, fShapeDim); + fShapeY = ret.second; + fInitialized = model.IsInitializedTensor(fNX) && fInitializedShape; + std::vector shapeX; + std::vector shapeY; + // case shape tensor and input shape are known + if (!model.IsDynamicTensor(fNX) && !model.IsDimInputTensor(fNX) && fInitializedShape) { + shapeX = ConvertShapeToInt(fShapeX); + shapeY = ConvertShapeToInt(fShapeY); + if (!UTILITY::AreSameShape(shapeX, shapeY)) + fInitBroadcast = true; + } + if (fInitialized) { + // cannot have Dim initialized tensors + assert(!shapeX.empty() && !shapeY.empty()); + // Broadcast X to the common shape shapeY // If X is an initialized tensor (constant) auto data = model.GetInitializedTensorData(fNX); - if (broadcast) { + if (fInitBroadcast) { std::shared_ptr broadcastedData( - 
UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeX, fShapeY), + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), shapeX, shapeY), std::default_delete()); // Update the data and the shape of X - model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), fShapeY, broadcastedData); + model.UpdateInitializedTensor(fNX, model.GetTensorType(fNX), shapeY, broadcastedData); fShapeX = fShapeY; // need to set as a not writable tensor model.SetNotWritableInitializedTensor(fNX); data = broadcastedData; } - if (broadcast || model.IsConstantTensor(fNX)) { + if (fInitBroadcast || model.IsConstantTensor(fNX)) { fIsOutputConstant = true; // constant output in this case - model.AddConstantTensor(fNY, model.GetTensorType(fNX), fShapeY, data); + model.AddConstantTensor(fNY, model.GetTensorType(fNX), shapeY, data); fOutputTensorNames.pop_back(); } else { - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), shapeY); } } else { - // case input is not initialized - model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); + // // case input is not initialized + // if (shapeX.empty() && shapeDim.empty()) { + + // } + // if (fInitializedShape) + model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); } fType = ConvertTypeToString(model.GetTensorType(fNX)); - if (model.Verbose()) - std::cout << "Expand - output is with shape " << ConvertShapeToString(fShapeY) << std::endl; + if (model.Verbose()) { + std::cout << "Expand - input " << fNX << " shape " << ConvertDimShapeToString(fShapeX) << " --> " << fNY << " shape " + << ConvertDimShapeToString(fShapeY) << (fIsOutputConstant ? 
ConvertValuesToString(model.GetTensorData(fNY)) + " (constant)" : "") << std::endl; + } + + if (fInitializedShape && model.IsInitializedTensor(fNShape)) { + // Shape values are fully consumed into fShapeY/fShapeDim at generation time — + // no device buffer needed for fNShape for Heterogeneous inference + model.SetNotWritableInitializedTensor(fNShape); + } } std::string GenerateInitCode() override { std::stringstream out; - if (!fIsOutputConstant && (fInitialized || fShapeX == fShapeY ) ) { - size_t length = ConvertShapeToLength(fShapeY); + if (!fIsOutputConstant && fInitialized && !fInitBroadcast) { + // shapeX and shapeY are the same in this case + auto length = ConvertDimShapeToLength(fShapeY); out << "// Copying initialized tensor " << fNX << " to " << fNY << "\n"; out << SP << "std::copy(tensor_" << fNX << ", " << "tensor_" << fNX << " + " << length << ", tensor_" << fNY << ");\n"; } return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; if (fShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Expand Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//------ Expand Op" << "\n"; + out << SP << "\n//------ Expand " << opName << " --> " << ConvertDimShapeToString(fShapeY) << "\n"; + // need to declare shape parameters for non initialized shapes + if (!fInitializedShape) { + for (size_t i = 0; i < fShapeDim.size(); i++) { + out << SP << "size_t " << fShapeDim[i] << " = " << "tensor_" << fNShape << "[" << i << "];\n"; + } + } // No need to broadcast A if it's an initialized tensor or shapes are the same if (!fInitialized && fShapeX != fShapeY) { out << SP << "// Broadcasting uninitialized tensor " << fNX << "\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << fType << ">(tensor_" << fNX << ", " << ConvertShapeToString(fShapeX) << ", " << 
ConvertShapeToString(fShapeY) - << ", std::span<"<(tensor_"<& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + if (!isStatic(fShapeX) || !isStatic(fShapeY)) return ""; + + // Check if broadcast is actually needed + bool needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + needsBroadcast = std::any_of(fShapeX.begin(), fShapeX.end(), + [&](const Dim& d) { + size_t i = &d - fShapeX.data(); + return fShapeX[i].dim != fShapeY[i].dim; + }); + } + if (!needsBroadcast) return ""; // same static shape — just a memcpy + + const std::size_t D = fShapeY.size(); + + // Left-pad fShapeX with dim=1 entries to match rank of fShapeY + std::vector shapeX_padded(D, 1); + size_t offset = D - fShapeX.size(); + for (size_t i = 0; i < fShapeX.size(); ++i) + shapeX_padded[offset + i] = fShapeX[i].dim; + + std::vector shapeY_int(D); + for (size_t i = 0; i < D; ++i) + shapeY_int[i] = fShapeY[i].dim; + auto stridesX = UTILITY::ComputeStrideFromShape(shapeX_padded); + auto stridesY = UTILITY::ComputeStrideFromShape(shapeY_int); + std::size_t totalElements = ConvertShapeToLength(shapeY_int); + + std::string kname = "ExpandKernel_" + opName; + + std::string op; + op = "\n//------ EXPAND_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += 
grid_thread_extent) {\n\n"; + + // Decompose output linear index using compile-time output strides + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(shapeY_int[d]) + "u;\n"; + } + op += "\n"; + + // Input index: broadcast dims (shapeX_padded[d]==1) contribute 0 — + // compiler eliminates zero terms entirely, no runtime branch + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + if (shapeX_padded[d] == 1) { + op += SP + SP + SP + SP + SP + "0u"; + } else { + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(d) + + " * " + std::to_string(stridesX[d]) + "u"; + } + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; // end grid-stride loop + op += SP + SP + "}\n"; // end operator() + op += SP + "};\n"; // end struct + + return op; +} + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + if (fInitialized) return ""; + + auto isStatic = [](const std::vector& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + if (!isStatic(fShapeX) || !isStatic(fShapeY)) return ""; + + // Check if broadcast is actually needed + bool needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + for (size_t i = 0; i < fShapeX.size(); ++i) + if (fShapeX[i].dim != fShapeY[i].dim) { needsBroadcast = true; break; } + } + if (!needsBroadcast) return ""; + + opName = "op_" + opName; + std::string kname = "ExpandKernel_" + opName; + return SP + kname + " expandKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw 
std::runtime_error("TMVA SOFIE Operator Expand called to Generate without being initialized first"); + + std::stringstream out; + out << "\n//------ EXPAND_GPU_ALPAKA\n"; + + if (fInitialized && !fInitBroadcast) { + // GenerateInitCode already handled the copy — nothing to do at inference time + return ""; + } + + auto isStatic = [](const std::vector& shape) { + return std::all_of(shape.begin(), shape.end(), + [](const Dim& d){ return !d.isParam; }); + }; + bool staticShapes = isStatic(fShapeX) && isStatic(fShapeY); + + // Check if broadcast is actually needed for static shapes + bool needsBroadcast = !staticShapes; // dynamic always needs runtime broadcast + if (staticShapes) { + needsBroadcast = (fShapeX.size() != fShapeY.size()); + if (!needsBroadcast) { + for (size_t i = 0; i < fShapeX.size(); ++i) + if (fShapeX[i].dim != fShapeY[i].dim) { needsBroadcast = true; break; } + } + } + + if (!needsBroadcast) { + // Same static shape — device-to-device copy + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY + << ", deviceBuf_" << fNX << ");\n"; + out << SP << "alpaka::wait(queue);\n"; + return out.str(); + } + + if (!staticShapes) { + // Dynamic shapes — not yet supported on GPU, throw a clear error + throw std::runtime_error( + "TMVA SOFIE Expand GPU: dynamic shapes are not yet supported for GPU inference. 
" + "Tensor " + fNX + " has a dynamic shape."); + } + + // Static broadcast — launch the expand kernel + std::vector shapeY_int(fShapeY.size()); + for (size_t i = 0; i < fShapeY.size(); ++i) + shapeY_int[i] = fShapeY[i].dim; + std::size_t totalElements = ConvertShapeToLength(shapeY_int); + std::string kname = "expandKernel_" + opName; + + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "alpaka::KernelCfg const kernelCfg_" << opName + << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName + << ", devAcc, " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + + return out.str(); +} +}; }//SOFIE #endif //SOFIE_ROperator_Expand diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx index bb1a74e..5b553ff 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.hxx @@ -11,7 +11,6 @@ #include #include - namespace SOFIE { /*! 
\brief Gated Recurrent Unit operator @@ -91,7 +90,7 @@ template class ROperator_GRU final : public ROperator { fNSequence_lens(UTILITY::Clean_name(nameSequence_lens)), fNInitial_h(UTILITY::Clean_name(nameInitial_h)), fNY(UTILITY::Clean_name(nameY)), fNY_h(UTILITY::Clean_name(nameY_h)) { - + fInputTensorNames = { fNX, fNW, fNR }; if (!fNB.empty()){ fInputTensorNames.emplace_back(fNB); @@ -123,39 +122,34 @@ template class ROperator_GRU final : public ROperator { * * \param input type of the input tensors */ - std::vector TypeInference(std::vector /*input*/); + std::vector TypeInference(std::vector /*input*/) override; /*! \brief Infers the shape of the output tensors * * \param input shape of the input tensors */ - std::vector> ShapeInference(std::vector> /*input*/); + std::vector> ShapeInference(std::vector> /*input*/) override; /*! \brief Initialize the model * * \param model Model */ - void Initialize(RModel &); + void Initialize(RModel &) override; /*! \brief Generate the inference code * * \param OpName name of the operator */ - std::string Generate(std::string /*OpName*/); - - /*! \brief Generate the code for the Session internal data vectors - * - * \param opName name of the operator - */ - std::string GenerateSessionMembersCode(std::string opName); + std::string Generate(std::string /*OpName*/) override; /*! 
\brief Returns the blas routines needed to compile the generated code */ - std::vector GetBlasRoutines() { return { std::string("Gemm"), std::string("Axpy") }; } + std::vector GetBlasRoutines() override { return { std::string("Gemm"), std::string("Axpy") }; } }; } // namespace SOFIE + // Implementation of the ROperator_GRU class #include "SOFIE/ROperator_GRU.icc" diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc index f3813c2..38030d1 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_GRU.icc @@ -175,51 +175,45 @@ void ROperator_GRU::Initialize(RModel& model){ fAttrActivations = {"Sigmoid", "Tanh"}; } } -} -// generate code for Session data members (e.g. internal vectors) -template -std::string ROperator_GRU::GenerateSessionMembersCode(std::string opName) -{ - opName = "op_" + opName; - std::stringstream out; + // To get unique intermediate tensor names, we add the name of the input + // tensor. One might also consider using the index of the operator in the + // RMode, but this information is not available in the current scope. + std::string opName = "op_gru_" + fNX; size_t num_directions = fShapeW[0]; size_t seq_length = (fAttrLayout == 0) ? fShapeX[0] : fShapeX[1]; size_t batch_size = (fAttrLayout == 0) ? 
fShapeX[1] : fShapeX[0]; size_t input_size = fShapeX[2]; + auto declareVector = [&](std::string const &name, std::size_t n){ + std::string fullName = opName + "_" + name; + model.AddIntermediateTensor(fullName, ConvertStringToType(fType), std::vector{n}); + }; + if (fAttrLayout != 0) { - out << "std::vector<" << fType << "> fVec_" << opName << "_input = std::vector<" << fType << ">(" - << seq_length * batch_size * input_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_hidden_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_initial_cell_state = std::vector<" << fType << ">(" - << num_directions * batch_size * fAttrHiddenSize << ");\n"; + declareVector("input", seq_length * batch_size * input_size); + declareVector("initial_hidden_state", num_directions * batch_size * fAttrHiddenSize); + declareVector("initial_cell_state", num_directions * batch_size * fAttrHiddenSize); } // Set the feedforward size_t ff_size = seq_length * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_update_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_reset_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_f_hidden_gate = std::vector<" << fType << ">(" << ff_size << ");\n"; + declareVector("f_update_gate", ff_size); + declareVector("f_reset_gate", ff_size); + declareVector("f_hidden_gate", ff_size); // gate results size_t hs_size = seq_length * num_directions * batch_size * fAttrHiddenSize; - out << "std::vector<" << fType << "> fVec_" << opName << "_update_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << "std::vector<" << fType << "> fVec_" << opName << "_reset_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; - out << 
"std::vector<" << fType << "> fVec_" << opName << "_hidden_gate = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("update_gate", hs_size); + declareVector("reset_gate", hs_size); + declareVector("hidden_gate", hs_size); // feedback - out << "std::vector<" << fType << "> fVec_" << opName << "_feedback = std::vector<" << fType << ">(" - << batch_size * fAttrHiddenSize << ");\n"; + declareVector("feedback", batch_size * fAttrHiddenSize); // hiddden state if (fAttrLayout != 0 || fNY.empty()) { - out << "std::vector<" << fType << "> fVec_" << opName << "_hidden_state = std::vector<" << fType << ">(" << hs_size << ");\n"; + declareVector("hidden_state", hs_size); } - - out << "\n"; - - return out.str(); } @@ -234,12 +228,14 @@ auto ROperator_GRU::Generate(std::string OpName) size_t input_size = fShapeX[2]; size_t num_directions = fShapeW[0]; + auto getVec = [&](std::string const &name) { return "tensor_op_gru_" + fNX + "_" + name; }; + // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const* " << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; + out << SP << fType << " * " << OpName << "_input = " << getVec("input") << ";\n"; } else { out << SP << fType << " " << OpName << "_input[" << seq_length * batch_size * input_size << "];\n"; } @@ -261,8 +257,7 @@ auto ROperator_GRU::Generate(std::string OpName) << fNInitial_h << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_initial_hidden_state = fVec_" << OpName - << "_initial_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_initial_hidden_state = " << getVec("initial_hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_initial_hidden_state[" << num_directions * batch_size * fAttrHiddenSize << "];\n"; @@ -283,9 +278,9 @@ 
auto ROperator_GRU::Generate(std::string OpName) // Set the feedforward size_t feedforward_size = seq_length * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_f_update_gate = fVec_" << OpName << "_f_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_reset_gate = fVec_" << OpName << "_f_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_f_hidden_gate = fVec_" << OpName << "_f_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_f_update_gate = " << getVec("f_update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_reset_gate = " << getVec("f_reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_f_hidden_gate = " << getVec("f_hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_f_update_gate[" << feedforward_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_f_reset_gate[" << feedforward_size << "] = {0};\n"; @@ -294,9 +289,9 @@ auto ROperator_GRU::Generate(std::string OpName) // Set the gates size_t hidden_state_size = seq_length * num_directions * batch_size * fAttrHiddenSize; if (fUseSession) { - out << SP << fType << " * " << OpName << "_update_gate = fVec_" << OpName << "_update_gate.data();\n"; - out << SP << fType << " * " << OpName << "_reset_gate = fVec_" << OpName << "_reset_gate.data();\n"; - out << SP << fType << " * " << OpName << "_hidden_gate = fVec_" << OpName << "_hidden_gate.data();\n"; + out << SP << fType << " * " << OpName << "_update_gate = " << getVec("update_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_reset_gate = " << getVec("reset_gate") << ";\n"; + out << SP << fType << " * " << OpName << "_hidden_gate = " << getVec("hidden_gate") << ";\n"; } else { out << SP << fType << " " << OpName << "_update_gate[" << hidden_state_size << "] = {0};\n"; out << SP << fType << " " << OpName << "_reset_gate[" << hidden_state_size << "] = {0};\n"; @@ -307,14 +302,14 @@ auto 
ROperator_GRU::Generate(std::string OpName) out << SP << fType << " *" << OpName << "_hidden_state = tensor_" << fNY << ";\n"; } else { if (fUseSession) { - out << SP << fType << " * " << OpName << "_hidden_state = fVec_" << OpName << "_hidden_state.data();\n"; + out << SP << fType << " * " << OpName << "_hidden_state = " << getVec("hidden_state") << ";\n"; } else { out << SP << fType << " " << OpName << "_hidden_state[" << hidden_state_size << "] = {0};\n"; } } if (fUseSession) { - out << SP << fType << " * " << OpName << "_feedback = fVec_" << OpName << "_feedback.data();\n"; + out << SP << fType << " * " << OpName << "_feedback = " << getVec("feedback") << ";\n"; } else { out << SP << fType << " " << OpName << "_feedback[" << batch_size * fAttrHiddenSize << "] = {0};\n"; } diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx index 4d34846..a56b012 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Gather.hxx @@ -22,9 +22,9 @@ private: std::string fNIndices; std::string fNY; - std::vector fShapeX; - std::vector fShapeIndices; - std::vector fShapeY; + std::vector fShapeX; + std::vector fShapeIndices; + std::vector fShapeY; std::vector fIndices; // indices vector in case they are known at initialization @@ -51,8 +51,12 @@ public: if (!model.CheckIfTensorAlreadyExist(fNX)) { throw std::runtime_error("TMVA SOFIE Gather Op Input Tensor " + fNX + " is not found in model"); } - fShapeX = model.GetTensorShape(fNX); - fShapeIndices = model.GetTensorShape(fNIndices); + fShapeX = model.GetDimTensorShape(fNX); + if (model.Verbose()) + std::cout << "Gather - initial shape " << ConvertDimShapeToString(fShapeX) << " shape of indices " + << ConvertDimShapeToString(model.GetDimTensorShape(fNIndices)) << std::endl; + // fShapeIndices can be dynamic + fShapeIndices = model.GetDimTensorShape(fNIndices); size_t q = fShapeIndices.size(); // Axis in range [0, r) where r=rank(X) size_t 
r = fShapeX.size(); @@ -60,18 +64,20 @@ public: if (fAttrAxis < 0) { fAttrAxis = fAttrAxis + int64_t(r); } - // empty fShapeIndices is a scalar value for the indices - size_t indicesLength = ConvertShapeToLength(fShapeIndices); + // case indices tensor is initialized if (model.IsInitializedTensor(fNIndices)) { + // empty shape Indices is a scalar value for the indices + size_t indicesLength = ConvertShapeToLength(model.GetTensorShape(fNIndices)); int64_t* indicesData = static_cast(model.GetInitializedTensorData(fNIndices).get()); - //flag index tensor as not writable (not sure this is needed since index tensor might be used in generated code) - model.SetNotWritableInitializedTensor(fNIndices); // update indices data in case of negative dim values for (size_t i = 0; i < indicesLength; i++) { - if (indicesData[i] < 0) { - indicesData[i] += fShapeX[fAttrAxis]; + // move this at generation time? + if (!fShapeX[fAttrAxis].isParam) { + if (indicesData[i] < 0) { + indicesData[i] += fShapeX[fAttrAxis].dim; + } } } // Save in a vector gather Indices of size q @@ -79,65 +85,91 @@ public: } // Output shape if (model.Verbose()) - std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertShapeToString(fShapeIndices) << std::endl; + std::cout << "Gather: q and r " << q << " " << r << " shape indices " << ConvertDimShapeToString(fShapeIndices) << std::endl; if (fShapeY.empty()) { fShapeY.resize(q + r - 1); if (fAttrAxis > 0) { - // Copy shape of X[0, ..., axis) to Shape of Y[0, ..., axis) + // Copy shape of X[0, ..., axis-1) to Shape of Y[0, ..., axis-1) std::copy(fShapeX.begin(), fShapeX.begin() + fAttrAxis, fShapeY.begin()); } // Set shape of Y[axis, ..., axis + q) for (size_t i = 0; i < q; i++) { - fShapeY[fAttrAxis + i] = fShapeIndices[i]; + fShapeY[fAttrAxis + i] = Dim{ fShapeIndices[i]}; } - // Copy shape of X[axis + 1, ..., axis + r) to shape of Y[axis + q, ... q + r - 1) + // Copy shape of X[axis + 1, ..., r) to shape of Y[axis + q, ... 
q + r - 1) std::copy(fShapeX.begin() + fAttrAxis + 1, fShapeX.end(), fShapeY.begin() + fAttrAxis + q); } // case input is known (type is an integer) and input indices is a scalar (or vector of size 1) if (model.IsInitializedTensor(fNX) && q <= 1 && r == 1 && fIndices.size() > 0) { + auto shapeX = ConvertShapeToInt(fShapeX); // we assume model is not dynamic + auto shapeY = ConvertShapeToInt(fShapeY); if (model.GetTensorType(fNX) == ETensorType::INT64) { auto inputData = static_cast(model.GetInitializedTensorData(fNX).get()); // if q <=1 and r = 1 output length = 1 (it is a scalar) - std::vector outputData(ConvertShapeToLength(fShapeY)); + std::vector outputData(1); //ConvertShapeToLength(shapeY)); outputData[0] = inputData[fIndices[0]]; - model.AddConstantTensor(fNY, fShapeY, outputData.data()); + model.AddConstantTensor(fNY, shapeY, outputData.data()); if (model.Verbose()) - std::cout << "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) + std::cout << "Gather: " << fNX << " " << ConvertShapeToString(shapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(shapeY) << " and values " << ConvertValuesToString(outputData) << " (constant) " << std::endl; fIsOutputConstant = true; } } + // case input is a shape tensor (r is == 1 by definition) and indices are known + else if (model.IsShapeTensor(fNX) && q <=1 && fIndices.size() > 0) { + auto inputData = model.GetShapeTensorValues(fNX); + // if r == 1 and q<=1 then output length is 1 (is a scalar or tensor of size1) + std::vector outputData(1); + outputData[0] = inputData[fIndices[0]]; + if (outputData[0].isParam) { + fIsOutputConstant = true; + // shapeY can be scalar or vector of size1 + model.AddShapeTensor(fNY, outputData, fShapeY.size() == 0); + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) + << " and values " << 
ConvertDimShapeToString(outputData) << " (shape) " << std::endl; + } else { + int64_t value = static_cast(outputData[0].dim); + auto shapeY = ConvertShapeToInt(fShapeY); + model.AddConstantTensor(fNY, shapeY, &value); + fIsOutputConstant = true; + if (model.Verbose()) + std::cout << "Gather: " << fNX << " " << ConvertDimShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) + << " and values {" << value << "} (constant) " << std::endl; + } + } if (!fIsOutputConstant) { // Add output tensor model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); fType = ConvertTypeToString(model.GetTensorType(fNX)); if (model.Verbose()) - std::cout << "Gather: " << fNX << " " << ConvertShapeToString(fShapeX) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << std::endl; + std::cout << "Gather: input " << fNX << " " << ConvertDimShapeToString(fShapeX) << " indices " << fNIndices << ConvertDimShapeToString(fShapeIndices) + << " -> " << fNY << " with shape " << ConvertDimShapeToString(fShapeY) << std::endl; } } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { + opName = "op_" + opName; + std::stringstream out; + out << "//--------- Gather " << opName << " --> " << fNY << " " << ConvertDimShapeToString(fShapeY) << "\n"; if (fIsOutputConstant) { // no code to generate here for constant output. 
Tensor output is defined in Session constructor - return "//---------------------------------------\n"; + out << "//--------------------(constant)----------\n"; + return out.str(); } - OpName = "op_" + OpName; - std::stringstream out; - out << "//--------- Gather operator \n"; // The shape of the output is q + r - 1 size_t r = fShapeX.size(); // Indices of shape q size_t q = fShapeIndices.size(); // Strides - std::vector stridesX = UTILITY::ComputeStrideFromShape(fShapeX); - std::vector stridesY = UTILITY::ComputeStrideFromShape(fShapeY); - std::vector stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesX = UTILITY::ComputeStrideFromShape(fShapeX); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); // case fIndices is not known we need to correct for negative axis indices at run-time if (fIndices.empty()) { - size_t indicesLength = ConvertShapeToLength(fShapeIndices); + auto indicesLength = ConvertDimShapeToLength(fShapeIndices); out << SP << "// correct in case of negative gather indices\n"; out << SP << "for (size_t i = 0; i < " << indicesLength << "; i++){\n"; out << SP << SP << "if (tensor_" << fNIndices << "[i] < 0)\n"; @@ -145,73 +177,230 @@ public: out << SP << "}\n"; } - // Fill the output Y[j_0, j_1, ..., j_{axis - 1}, i_0, i_1, ..., i_{q - 1}, j_{axis + 1}, ..., j_{r - 1}] // [0 ... axis) [axis ... axis + q) [axis + q ... q + r - 1) // iterate in [0 ... axis) [0 ... q) [axis ... 
r - 1) // for j_0, j_1, ..., j_{axis-1} + for (size_t j = 0; j < size_t(fAttrAxis); j++) { std::string index = "j_" + std::to_string(j); - out << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n"; + for (size_t k = 0; k <= j; k++) out << SP; + out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[j] << "; " << index << "++) {\n"; } // for i_0, i_1, ..., i_{q - 1} - if (q == 0) - out << SP << SP << "{\n"; // add a scope for local variables for (size_t i = 0; i < q; i++) { std::string index = "i_" + std::to_string(i); - out << SP << SP << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n"; + for (size_t k = 0; k <= i + fAttrAxis; k++) out << SP; + out << "for (size_t " << index << " = " << 0 << "; " << index << " < " << fShapeIndices[i] << "; " << index << "++) {\n"; } // for j_axis, j_{axis + 1}, ..., j_{r - 1} for (size_t j = fAttrAxis; j + 1 < r; j++) { - std::string index = "j_" + std::to_string(j); - out << SP << SP << SP << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n"; + std::string index = "j_" + std::to_string(q+j); // annotate index using output axis + for (size_t k = 0; k <= q + j; k++) out << SP; + out << "for (size_t " << index << " = 0; " << index << " < " << fShapeY[q + j] << "; " << index << "++) {\n"; } - out << SP << SP << SP << "size_t y_index = 0;\n"; + // add a scope for local variables in case above loop are not done + if (fAttrAxis == 0 && q == 0 && r <= 1) + out << SP << "{ // scalar case \n"; + + // output index + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t y_index = "; for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[j] << ";\n"; + if (j > 0) out << " + "; + out << "j_" << j; + if (stridesY[j].dim != 1) out << " * " << stridesY[j]; } for (size_t i = 0; i 
< q; i++) { - out << SP << SP << SP << "y_index += i_" + std::to_string(i) + " * " << stridesY[fAttrAxis + i] << ";\n"; + if (fAttrAxis + i > 0) out << " + "; + out << "i_" << i; + if (stridesY[fAttrAxis + i].dim != 1) out << " * " << stridesY[fAttrAxis + i]; } for (size_t j = fAttrAxis; j + 1 < r; j++) { - out << SP << SP << SP << "y_index += j_" + std::to_string(j) + " * " << stridesY[q + j] << ";\n"; + if (j + q > 0) out << " + "; + out << "j_" << q+j; + if (stridesY[q+j].dim != 1) out << " * " << stridesY[q+j]; } - // Indices - out << SP << SP << SP << "size_t i_index = 0;\n"; + // empty case + if (fAttrAxis == 0 && q == 0 && r <= 1) + out << "0"; + out << ";\n"; + + // input Indices + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t i_index = "; for (size_t i = 0; i < q; i++) { - out << SP << SP << SP << "i_index += i_" + std::to_string(i) + " * " << stridesIndices[i] << ";\n"; + if (i > 0) out << " + "; + out << "i_" << i; + if (stridesIndices[i].dim != 1) out << " * " << stridesIndices[i]; } + // empty case + if (q == 0) + out << "0"; + out << ";\n"; + // K - out << SP << SP << SP << "size_t k = static_cast(" << "tensor_" << fNIndices << "[i_index]" << ");\n"; + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t k = static_cast(" << "tensor_" << fNIndices << "[i_index]" << ");\n"; // Input - out << SP << SP << SP << "size_t x_index = k * " << stridesX[fAttrAxis] << ";\n"; + for (size_t k = 0; k < q + r; k++) out << SP; + out << "size_t x_index = k"; + if (stridesX[fAttrAxis].dim != 1) out << " * " << stridesX[fAttrAxis]; for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << SP << SP << "x_index += j_" + std::to_string(j) + " * " << stridesX[j] << ";\n"; + out << " + "; + out << " j_" << j; + if (stridesX[j].dim != 1) out << " * " << stridesX[j]; } - for (size_t j = fAttrAxis + 1; j < r; j++) { - out << SP << SP << SP << "x_index += j_" + std::to_string(j - 1) + " * " << stridesX[j] << ";\n"; + // for input corresponding 
stride is axis+1,.... r + // loop is on j from fAttrAxis, so consider stridesX[j+1] + for (size_t j = fAttrAxis; j+1 < r; j++) { + out << " + "; + out << " j_" << q+j; + if (stridesX[j+1].dim != 1) out << " * " << stridesX[j+1]; } - out << SP << SP << SP << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n"; + out << ";\n"; + for (size_t k = 0; k < q + r; k++) out << SP; + out << "tensor_" << fNY << "[y_index] = tensor_" << fNX << "[x_index];\n"; // end loops j_k, j_{k + 1}, ..., j_{r - 2} - for (size_t j = fAttrAxis; j + 1 < r; j++) { - out << SP << SP << SP << "}\n"; - } - // end loops i_0, i_1, ..., i_{q - 1} - if (q == 0) - out << SP << SP << "}\n"; // end of scope for q = 0 - for (size_t i = 0; i < q; i++) { - out << SP << SP << "}\n"; - } - // end loops j_0, j_1, ..., j_{axis - 1} - for (size_t j = 0; j < size_t(fAttrAxis); j++) { - out << SP << "}\n"; + for (size_t j = q+r-1; j > 0; j--) { + for (size_t k = 0; k \n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "int64_t const* __restrict__ indices,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + stridesY[d].GetVal() + "u) % " + + fShapeY[d].GetVal() + "u;\n"; + } + op += "\n"; + + // Output dims [axis ... axis+q) correspond to the indices tensor dims [0 ... 
q) + // so i_index = sum over i in [0,q): out_{axis+i} * stridesIndices[i] + if (q == 0) { + op += SP + SP + SP + SP + "std::size_t const i_index = 0u;\n"; + } else { + op += SP + SP + SP + SP + "std::size_t const i_index =\n"; + for (std::size_t i = 0; i < q; ++i) { + op += SP + SP + SP + SP + SP + + "out_" + std::to_string(fAttrAxis + i) + + " * " + stridesIndices[i].GetVal() + "u"; + op += (i + 1 < q) ? " +\n" : ";\n"; + } + } + op += "\n"; + + op += SP + SP + SP + SP + "int64_t k = indices[i_index];\n"; + op += SP + SP + SP + SP + "if (k < 0) k += " + fShapeX[fAttrAxis].GetVal() + ";\n"; + op += SP + SP + SP + SP + "if (k < 0) k = 0;\n"; + op += SP + SP + SP + SP + "if (k >= static_cast(" + fShapeX[fAttrAxis].GetVal() + ")) " + + "k = static_cast(" + fShapeX[fAttrAxis].GetVal() + ") - 1;\n\n"; + + // x_index = k * stridesX[axis] + // + sum over j in [0, axis): out_j * stridesX[j] + // + sum over j in [axis+1, r): out_{j-1+q} * stridesX[j] + // (the dims after axis in Y are shifted by q-1 relative to X) + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + op += SP + SP + SP + SP + SP + "static_cast(k) * " + stridesX[fAttrAxis].GetVal() + "u"; + for (std::size_t j = 0; j < static_cast(fAttrAxis); ++j) { + op += " +\n" + SP + SP + SP + SP + SP + + "out_" + std::to_string(j) + " * " + stridesX[j].GetVal() + "u"; + } + for (std::size_t j = fAttrAxis + 1; j < r; ++j) { + // in Y, the coord for X's dim j lives at output dim q + j - 1 + op += " +\n" + SP + SP + SP + SP + SP + + "out_" + std::to_string(q + j - 1) + " * " + stridesX[j].GetVal() + "u"; + } + op += ";\n\n"; + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; +} + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + std::string kname = "GatherKernel_" + opName; + return SP + kname + " 
gatherKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE Gather Op called to Generate without being initialized first"); + + auto totalElements = ConvertDimShapeToLength(fShapeY); + std::string kname = "gatherKernel_" + opName; + + std::stringstream out; + out << "\n//------ GATHER_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "alpaka::KernelCfg const kernelCfg_" << opName + << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName + << ", devAcc, " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + + return out.str(); +} + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_GatherND.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_GatherND.hxx new file mode 100644 index 0000000..3fa45fa --- /dev/null +++ b/src/SOFIE_core/inc/SOFIE/ROperator_GatherND.hxx @@ -0,0 +1,304 @@ +#ifndef SOFIE_ROPERATOR_GATHERND +#define SOFIE_ROPERATOR_GATHERND + +#include "SOFIE/SOFIE_common.hxx" +#include "SOFIE/ROperator.hxx" +#include "SOFIE/RModel.hxx" + +#include +#include +#include +#include + 
+namespace SOFIE { + +class ROperator_GatherND final : public ROperator +{ +private: + + int64_t fBatchDims = 0; + + std::string fNData; + std::string fNIndices; + std::string fNY; + + std::vector fShapeData; + std::vector fShapeIndices; + std::vector fShapeY; + + std::string fType; + +public: + ROperator_GatherND() {} + ROperator_GatherND(int64_t batchDims, + std::string nameData, + std::string nameIndices, + std::string nameY) + : fBatchDims(batchDims), + fNData(UTILITY::Clean_name(nameData)), + fNIndices(UTILITY::Clean_name(nameIndices)), + fNY(UTILITY::Clean_name(nameY)) + { + fInputTensorNames = { fNData, fNIndices }; + fOutputTensorNames = { fNY }; + } + + std::vector TypeInference(std::vector input) override { + return { input[0] }; + } + + std::vector> ShapeInference(std::vector> input) override { + return { input[0] }; + } + + void Initialize(RModel& model) override { + if (!model.CheckIfTensorAlreadyExist(fNData)) + throw std::runtime_error("TMVA SOFIE GatherND: data tensor " + fNData + " not found in model"); + if (!model.CheckIfTensorAlreadyExist(fNIndices)) + throw std::runtime_error("TMVA SOFIE GatherND: indices tensor " + fNIndices + " not found in model"); + + fShapeData = model.GetTensorShape(fNData); + fShapeIndices = model.GetTensorShape(fNIndices); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + if (r < 1) + throw std::runtime_error("TMVA SOFIE GatherND: data rank must be >= 1"); + if (q < 1) + throw std::runtime_error("TMVA SOFIE GatherND: indices rank must be >= 1"); + if (b >= std::min(q, r)) + throw std::runtime_error("TMVA SOFIE GatherND: batch_dims must be < min(q, r)"); + if (last_idx_dim > r - b) + throw std::runtime_error("TMVA SOFIE GatherND: indices_shape[-1] must be <= r - batch_dims"); + + for (size_t i = 0; i < b; ++i) { + if (fShapeData[i] != fShapeIndices[i]) + throw std::runtime_error("TMVA SOFIE GatherND: first 
batch_dims dimensions of data and indices must match"); + } + + // Output shape: batch_dims + indices[0..q-2] + data[b + last_idx_dim .. r-1] + // rank = b + (q - b - 1) + (r - b - last_idx_dim) + // = q + r - last_idx_dim - 1 - b + fShapeY.clear(); + for (size_t i = 0; i < b; ++i) + fShapeY.push_back(fShapeData[i]); + for (size_t i = b; i + 1 < q; ++i) + fShapeY.push_back(fShapeIndices[i]); + for (size_t i = b + last_idx_dim; i < r; ++i) + fShapeY.push_back(fShapeData[i]); + + model.AddIntermediateTensor(fNY, model.GetTensorType(fNData), fShapeY); + fType = ConvertTypeToString(model.GetTensorType(fNData)); + + if (model.Verbose()) + std::cout << "GatherND: data " << ConvertShapeToString(fShapeData) + << " indices " << ConvertShapeToString(fShapeIndices) + << " batch_dims=" << fBatchDims + << " -> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; + } + + std::string Generate(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE GatherND called to Generate without being initialized first"); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + auto stridesData = UTILITY::ComputeStrideFromShape(fShapeData); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + size_t totalOutput = ConvertShapeToLength(fShapeY); + + std::stringstream out; + out << SP << "//--------- GatherND operator " << opName << "\n"; + + out << SP << "for (size_t out_idx = 0; out_idx < " << totalOutput << "; out_idx++) {\n"; + + out << SP << SP << "size_t rem = out_idx;\n"; + size_t Dy = fShapeY.size(); + for (size_t d = 0; d < Dy; ++d) { + out << SP << SP << "size_t oy_" << d << " = rem / " << stridesY[d] << ";\n"; + out << SP << SP << "rem %= " << stridesY[d] << ";\n"; + } + + out << SP << SP << "size_t idx_base = 0;\n"; + for (size_t i 
= 0; i < b; ++i) + out << SP << SP << "idx_base += oy_" << i << " * " << stridesIndices[i] << ";\n"; + for (size_t i = b; i + 1 < q; ++i) + out << SP << SP << "idx_base += oy_" << i << " * " << stridesIndices[i] << ";\n"; + + out << SP << SP << "size_t data_idx = 0;\n"; + for (size_t i = 0; i < b; ++i) + out << SP << SP << "data_idx += oy_" << i << " * " << stridesData[i] << ";\n"; + + out << SP << SP << "for (size_t k = 0; k < " << last_idx_dim << "; k++) {\n"; + out << SP << SP << SP << "int64_t idx_val = tensor_" << fNIndices + << "[idx_base + k * " << stridesIndices[q - 1] << "];\n"; + out << SP << SP << SP << "if (idx_val < 0) idx_val += " << "static_cast(tensor_" + << fNData << "_shape[" << b << " + k]);\n"; + out << SP << SP << SP << "data_idx += static_cast(idx_val) * " << "data_stride_b_plus_k_" << opName << "[k];\n"; + out << SP << SP << "}\n"; + + // Accumulate trailing data dims from output coords + // Y dims [b + (q-b-1) .. ] correspond to data dims [b + last_idx_dim .. r-1] + size_t y_trailing_start = b + (q - b - 1); + for (size_t i = b + last_idx_dim; i < r; ++i) { + size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim)); + out << SP << SP << "data_idx += oy_" << oy_dim << " * " << stridesData[i] << ";\n"; + } + + out << SP << SP << "tensor_" << fNY << "[out_idx] = tensor_" << fNData << "[data_idx];\n"; + out << SP << "}\n"; + + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE GatherND called to Generate without being initialized first"); + + size_t r = fShapeData.size(); + size_t q = fShapeIndices.size(); + size_t b = static_cast(fBatchDims); + size_t last_idx_dim = fShapeIndices.back(); + + auto stridesData = UTILITY::ComputeStrideFromShape(fShapeData); + auto stridesIndices = UTILITY::ComputeStrideFromShape(fShapeIndices); + auto stridesY = UTILITY::ComputeStrideFromShape(fShapeY); + + size_t Dy = 
fShapeY.size(); + size_t totalOutput = ConvertShapeToLength(fShapeY); + + std::string kname = "GatherNDKernel_" + opName; + + std::string op; + op = "\n//------ GATHERND_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ data,\n"; + op += SP + SP + SP + "int64_t const* __restrict__ indices,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (size_t d = 0; d < Dy; ++d) { + op += SP + SP + SP + SP + "std::size_t const oy_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(stridesY[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const idx_base =\n"; + // batch dims: oy_0..oy_{b-1} * stridesIndices[0..b-1] + // outer idx dims: oy_b..oy_{b+(q-b-2)} * stridesIndices[b..q-2] + bool first = true; + for (size_t i = 0; i < q - 1; ++i) { + op += SP + SP + SP + SP + SP + + (first ? "" : "+ ") + + "oy_" + std::to_string(i) + " * " + std::to_string(stridesIndices[i]) + "u\n"; + first = false; + } + if (first) op += SP + SP + SP + SP + SP + "0u\n"; // q==1: scalar index tuple + op += SP + SP + SP + SP + SP + ";\n\n"; + + op += SP + SP + SP + SP + "std::size_t data_idx =\n"; + first = true; + for (size_t i = 0; i < b; ++i) { + op += SP + SP + SP + SP + SP + + (first ? 
"" : "+ ") + + "oy_" + std::to_string(i) + " * " + std::to_string(stridesData[i]) + "u\n"; + first = false; + } + if (first) op += SP + SP + SP + SP + SP + "0u\n"; + op += SP + SP + SP + SP + SP + ";\n\n"; + + op += SP + SP + SP + SP + "// Read " + std::to_string(last_idx_dim) + "-element index tuple\n"; + for (size_t k = 0; k < last_idx_dim; ++k) { + size_t idx_offset = k; + size_t data_axis = b + k; + op += SP + SP + SP + SP + "{\n"; + op += SP + SP + SP + SP + SP + + "int64_t idx_val = indices[idx_base + " + + std::to_string(idx_offset) + "u];\n"; + op += SP + SP + SP + SP + SP + + "if (idx_val < 0) idx_val += " + + std::to_string(fShapeData[data_axis]) + ";\n"; + op += SP + SP + SP + SP + SP + + "data_idx += static_cast(idx_val) * " + + std::to_string(stridesData[data_axis]) + "u;\n"; + op += SP + SP + SP + SP + "}\n"; + } + op += "\n"; + + size_t y_trailing_start = b + (q - b - 1); + for (size_t i = b + last_idx_dim; i < r; ++i) { + size_t oy_dim = y_trailing_start + (i - (b + last_idx_dim)); + op += SP + SP + SP + SP + + "data_idx += oy_" + std::to_string(oy_dim) + + " * " + std::to_string(stridesData[i]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "output[elem_idx] = data[data_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "GatherNDKernel_" + opName; + return SP + kname + " gatherNDKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE GatherND called to Generate without being initialized first"); + + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "gatherNDKernel_" + opName; + + std::stringstream out; + out << "\n//------ GATHERND_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" 
<< opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "alpaka::KernelCfg const kernelCfg_" << opName + << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName + << ", devAcc, " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNData << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNIndices << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } +}; + +} // SOFIE + +#endif // SOFIE_ROPERATOR_GATHERND diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx index 046bf56..47efe01 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Gemm.hxx @@ -23,6 +23,7 @@ namespace SOFIE{ private: bool fIsDynamic = false; + bool fBroadcastBias = false; float fAttrAlpha = 1.0; float fAttrBeta = 1.0; @@ -32,7 +33,6 @@ namespace SOFIE{ std::string fNA; std::string fNB; std::string fNC = ""; - std::string fNC2; // bias tensor name after broadcasting std::string fNY; std::string fType; EActivationType fActivation; @@ -40,6 +40,7 @@ namespace SOFIE{ std::vector fShapeB; std::vector fShapeC; std::vector fShapeY; + RModel * fModel = nullptr; public: @@ -48,6 +49,7 @@ namespace SOFIE{ fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), 
fNY(UTILITY::Clean_name(nameY)) { + fKind = OperatorKind::GEMM; fActivation = activation; fType = "float"; static_assert(std::is_same_v, @@ -60,9 +62,11 @@ namespace SOFIE{ fAttrAlpha(alpha), fAttrBeta(beta), fAttrTransA(transA), fAttrTransB(transB), fNA(UTILITY::Clean_name(nameA)), fNB(UTILITY::Clean_name(nameB)), fNC(UTILITY::Clean_name(nameC)), fNY(UTILITY::Clean_name(nameY)), fActivation(activation) { + fKind = OperatorKind::GEMM; fActivation = activation; fType = "float"; + fInputTensorNames = {fNA, fNB, fNC}; fOutputTensorNames = { fNY }; } @@ -72,7 +76,7 @@ namespace SOFIE{ } template - std::vector> DoShapeInference(const std::vector> & input){ + std::vector DoShapeInference(const std::vector> & input){ if (input.size() > 3) throw std::runtime_error("TMVA SOFIE Gemm Op Shape Inference only need 2 or 3 input tensor"); // accept tensor with input dimensions > 2 // example: A = (d1,d2,...,N1,N2) B = (d1,d2,...,N2,N3) --> Y = (d1,d2,..,N1,N3) @@ -82,11 +86,10 @@ namespace SOFIE{ } } - std::vector> ret; // when there are 3 inputs shape of Y is the one of C if (input.size() == 3){ - ret.push_back(input[2]); //shape of C is shape of Y - return ret; + //shape of C is shape of Y + return input[2]; } // ioffset cannot be less than 2 int ioffset = input[0].size()-2; // in case of tensors with dim > 2 @@ -105,6 +108,7 @@ namespace SOFIE{ if (input[0].size() > 2 && input[1].size() == input[0].size()) { // in case of dim > 2 first dimensions are equal to the input ones not // equal to 1 (e.g. 
(1,2,3) * (2,3,4) -> (2,2,4)) + // here could probably use the Broadcasting function UTILITY::MultidirectionalBroadcastShape for (size_t i = 0; i < input[0].size()-2; i++) { Dim valueA = input[0][i]; Dim valueB = input[1][i]; @@ -113,24 +117,41 @@ namespace SOFIE{ s_y.push_back(input[0][i]); else if (valueA.GetVal() == "1") s_y.push_back(input[1][i]); + else if (!valueA.isParam && !valueB.isParam) + throw std::runtime_error("TMVA SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " + + valueB.GetVal()); + else if (valueA.isParam && valueB.isParam){ + // check which parameter is first in RModel list + auto & dimNames = fModel->GetDimShapeNames(); + auto p1 = std::find(dimNames.begin(), dimNames.end(), valueA.param); + auto p2 = std::find(dimNames.begin(), dimNames.end(), valueB.param); + if (p1 < p2) s_y.push_back(input[0][i]); + else s_y.push_back(input[1][i]); + } + else if (!valueA.isParam) + s_y.push_back(input[0][i]); + else if (!valueB.isParam) + s_y.push_back(input[1][i]); else throw std::runtime_error("TMVA SOFIE Gemm Op - invalid input shapes " + valueA.GetVal() + " and " + valueB.GetVal()); } - s_y.push_back(input[0][i]); + else + s_y.push_back(input[0][i]); } } s_y.push_back(s_a[0]); s_y.push_back(s_b[1]); - ret.push_back(s_y); - return ret; + return s_y; } std::vector> ShapeInference(std::vector> input) override { - return DoShapeInference(input); + std::vector> ret; + ret.push_back(DoShapeInference(input)); + return ret; } - std::vector> DynamicShapeInference(const std::vector> & input){ + std::vector DynamicShapeInference(const std::vector> & input){ return DoShapeInference(input); } @@ -138,6 +159,7 @@ namespace SOFIE{ void Initialize(RModel& model) override { //TODO: propagate A or B as specified by ONNX standard + fModel = &model; if ((model.CheckIfTensorAlreadyExist(fNA) == false) || (model.CheckIfTensorAlreadyExist(fNB) == false) ){ //input must be a graph input, or already initialized intermediate tensor throw 
std::runtime_error("TMVA SOFIE Gemm Op Input Tensor " + fNA + " or " + fNB + " is not found in model"); @@ -186,14 +208,8 @@ namespace SOFIE{ } } - fShapeY = DynamicShapeInference({fShapeA, fShapeB})[0]; - std::vector shapeY; - if (!fIsDynamic) { - shapeY = ConvertShapeToInt(fShapeY); - if (shapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Gemm Op " + fNY + " has invalid shape" + ConvertDynamicShapeToString(fShapeY)); - } - } + fShapeY = DynamicShapeInference({fShapeA, fShapeB}); + std::vector shapeY = ConvertShapeToInt(fShapeY); // bias is normally not dynamic (not support it for time being) if (fNC != ""){ @@ -202,38 +218,27 @@ namespace SOFIE{ throw std::runtime_error("TMVA SOFIE Gemm Op Input Tensor" + fNC + " is dynamic and is not supported"); } fShapeC = model.GetTensorShape(fNC); - fNC2 = fNC; - size_t lengthC = ConvertShapeToLength(fShapeC); - size_t lengthY = ConvertShapeToLength(shapeY); - // for dynamic outputs broadcasting is always done - bool broadcast_needed = lengthC != lengthY; + // for dynamic outputs broadcasting is always needed + bool broadcast_needed = false; + if (fIsDynamic && shapeY.empty()) + broadcast_needed = true; + else + // consider broadcasting also if same length + broadcast_needed = (fShapeC != shapeY); if (broadcast_needed) { - if (!model.UseSession()) { - // without session dynamic tensors not supported in Gemm - if (fIsDynamic) { - throw std::runtime_error("TMVA SOFIE Gemm Op: dynamic tensors not supported without a session"); - } - auto original_data = model.GetInitializedTensorData(fNC); - auto targetShape = UTILITY::UnidirectionalBroadcastShape(fShapeC, shapeY); - if (fType == "float") { - std::shared_ptr new_data_ptr(UTILITY::UnidirectionalBroadcast( - static_cast(original_data.get()), fShapeC, targetShape), - std::default_delete()); - - model.UpdateInitializedTensor(fNC, model.GetTensorType(fNC), shapeY, new_data_ptr); - fShapeC = shapeY; - } - } else { - // In case of session add broadcasting code in Session 
constructor and in GenerateInitCode - // we need to add a new intermediate tensor for broadcasted bias tensor - fNC2 = fNC + "bcast"; - if (!fIsDynamic) { - model.AddIntermediateTensor(fNC2, model.GetTensorType(fNC), shapeY); - } - else - model.AddDynamicTensor(fNC2,model.GetTensorType(fNC), fShapeY); + fBroadcastBias = true; + // check if broadcasting is compatible and note that prepend 1 to shapeC + auto shapeDimC = ConvertShapeToDim(fShapeC); + auto r = UTILITY::MultidirectionalBroadcastShape(fShapeY, shapeDimC); + // return flag must be equal to 1 since this is a unidirectional broadcast of C->Y + if (r.first > 1) { + throw std::runtime_error("TMVA SOFIE Gemm Op - bias tensor of shape " + ConvertShapeToString(fShapeC) + " cannot be uni-directional broadcasted to " + ConvertDimShapeToString(fShapeY)); + } + fShapeC = ConvertShapeToInt(shapeDimC); + if (fShapeC.empty()) { + throw std::runtime_error("TMVA SOFIE Gemm Op - Error in bias tensor " + ConvertDimShapeToString(shapeDimC) ); } } } @@ -260,7 +265,7 @@ namespace SOFIE{ if (model.Verbose()){ std::cout << "Gemm (or MatMul) " << " ---> " << fNY << " shape "; if (fIsDynamic) - std::cout << ConvertDynamicShapeToString(fShapeY) << std::endl; + std::cout << ConvertDimShapeToString(fShapeY) << std::endl; else std::cout << ConvertShapeToString(shapeY) << std::endl; } @@ -268,35 +273,167 @@ namespace SOFIE{ model.AddNeededStdLib("algorithm"); } - std::string GenerateInitCode() override { + std::string Generate(std::string opName) override { + opName = "op_" + opName; + + if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { + throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); + } std::stringstream out; - // generate initialization code for broadcasting of bias tensor - if (fShapeC.size() != fShapeY.size() && fNC != fNC2) { - // we broadcast here always C in Y output, so target shape is the one of Y - // no need to call 
UTILITY::UnidirectionalBroadcastShape. - // here in case of parametric shape we need to assume that the parameters will be defined in the initialization code. - auto targetShape = fShapeY; - // include a separate scope to avoid defining unique operator temp variables - out << "//--- broadcast bias tensor " << fNC << "for Gemm op\n"; - out << SP << "{\n"; - out << " float * data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" - << fNC << "," << ConvertShapeToString(fShapeC) << ", " << ConvertDynamicShapeToString(fShapeY) << ");\n"; - auto length = SOFIE::ConvertDynamicShapeToLength(fShapeY); // output size - out << SP << SP << "std::copy(data, data + " << length << ", tensor_" << fNC2 << ");\n"; - out << SP << SP << "delete [] data;\n"; + out << "\n//--------- Gemm " << opName << " " << ConvertDimShapeToString(fShapeA) << " * " << ConvertDimShapeToString(fShapeB) + << " -> " << ConvertDimShapeToString(fShapeY) << "\n"; + // need to consider case A and B have dim > 2 (for MatMul) + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + int64_t dimY = fShapeY.size(); + if (dimA != dimB || dimA != dimY) { + throw std::runtime_error("TMVA SOFIE Gemm(MatMul) has invalid shape for inputs or output"); + } + auto m = (fAttrTransA ? fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? 
fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + // size of A: if (transposeA) is m*k else k*m + // size of B n*k + std::vector sY = {fShapeY[dimY-2], fShapeY[dimY-1]}; + // extra dimensions in case of stacked MatMul + std::vector sExtraY; + for (int64_t i = 0; i < dimY-2; i++) { + sExtraY.push_back(fShapeY[i]); + } + auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation + auto lengthExtra_Y = ConvertDimShapeToLength(sExtraY); // extra length in case input tensors are of dim>2 (MatMul) + + // case bias is present + if (!fNC.empty()){ + if (!fBroadcastBias) { + // add a check in case broadcasting was not needed or done outside of session + // C should have smaller dimension of Y + if (!fIsDynamic) { + if (std::stoi(lengthGemm) != static_cast(ConvertShapeToLength(fShapeC))) + throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor has not correct size " + + ConvertShapeToString(fShapeC) + " output length " + lengthGemm); + } else { + // add a dynamic check (C should not be a dynamic tensor) + out << SP << "assert(" << lengthGemm << " == " << ConvertShapeToLength(fShapeC) << ");\n"; + } + } + } else { + //in this case fAttrBeta needs to be equal to zero otherwise second time we run we will use + // the previous result + if (fAttrBeta != 0) { + throw std::runtime_error("TMVA SOFIE Gemm Op " + opName + " Bias tensor is not present but beta value in Gemm is not zero"); + } + } + + // include MatMul case where we stack the Gemm operations + // exclude case where we have only 1's in the additional dims + bool doStackMul = dimY > 2 && ( fIsDynamic || std::stoi(lengthExtra_Y) > 1); + // compute input offset for stack multiplications + std::string lengthExtra_A; + std::string lengthExtra_B; + std::string increment_A; + std::string increment_B; + + if (doStackMul) { + std::vector sA(fShapeA.begin(), fShapeA.begin()+dimA-2); + std::vector sB(fShapeB.begin(), fShapeB.begin()+dimB-2); + std::vector mA = {fShapeA[dimA-2], 
fShapeA[dimA-1]}; + std::vector mB = {fShapeA[dimB-2], fShapeB[dimB-1]}; + lengthExtra_A = ConvertDimShapeToLength(sA); + lengthExtra_B = ConvertDimShapeToLength(sB); + // size of A performing matmul is m*k and n*k for B + increment_A = ConvertDimShapeToLength(mA); + increment_B = ConvertDimShapeToLength(mB); + } + bool extraA = (doStackMul && lengthExtra_A != "1"); + bool extraB = (doStackMul && lengthExtra_B != "1"); + if (doStackMul) { + out << SP << "size_t " << opName << "_y_offset = 0;\n"; // needed if we stack the gemm operations + if (extraA) + out << SP << "size_t " << opName << "_A_offset = 0;\n"; + if (extraB) + out << SP << "size_t " << opName << "_B_offset = 0;\n"; + out << SP << "for (size_t i = 0; i < " << lengthExtra_Y << "; i++){\n"; + out << SP; + } + // do the bias broadcasting + if (fBroadcastBias) { + fAttrBeta = 1.; + out << SP << "for (size_t j = 0; j < " << sY[0] << "; j++) { \n"; + out << SP << SP << "size_t y_index = "; + if (doStackMul) // add offset in caseof stack multiplications (not sure if bias is present in these cases) + out << opName << "_y_offset + "; + if (sY[1].GetVal() != "1") + out << sY[1] << " * j;\n"; + else + out << "j;\n"; + + out << SP << SP << "for (size_t k = 0; k < " << sY[1] << "; k++) { \n"; + std::string bias_index; + if (fShapeC[0] == 1 && fShapeC[1] == sY[1].dim) + bias_index = "k"; + else if (fShapeC[1] == 1 && fShapeC[0] == sY[0].dim) + bias_index = "j"; + else if (fShapeC[0] == 1 && fShapeC[1] == 1) // scalar case + bias_index = "0"; + else { + throw std::runtime_error("TMVA SOFIE Gemm Op - invalid shape for bias tensor " + ConvertShapeToString(fShapeC)); + } + + out << SP << SP << SP << "tensor_" << fNY << "[y_index + k] = " << "tensor_" << fNC << "[" << bias_index << "];\n"; + out << SP << SP << "}\n"; out << SP << "}\n"; } + + if (fType == "float"){ + + out << SP << "TMVA::Experimental::SOFIE::Gemm_Call(" + << "tensor_" << fNY; + if (doStackMul) out << " + " << opName << "_y_offset"; + out << ", " + << 
(fAttrTransB ? "true, " : "false, ") + << (fAttrTransA ? "true, " : "false, ") + << n << ", " << m << ", " << k << ", "; + out << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ", tensor_" << fNB; + if (extraB) out << " + " << opName << "_B_offset"; + out << ", tensor_" << fNA; + if (extraA) out << " + " << opName << "_A_offset"; + out << ", " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ","; + // in the case of bias and no broadcasting needed + if (!fNC.empty() && !fBroadcastBias) + out << "tensor_" << fNC; + else + out << "nullptr"; + out << ");\n"; + + if(fActivation == EActivationType::RELU){ + out << SP << "for (int id = 0; id < " << ConvertDimShapeToLength(fShapeY) << " ; id++){\n"; + out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNY << "[id] > 0 )? tensor_" << fNY << "[id] : 0);\n"; + out << SP << "}\n"; + } + } + + if (doStackMul) { + out << SP << SP << opName << "_y_offset += " << lengthGemm << ";\n"; + if (lengthExtra_A != "1") + out << SP << SP << opName << "_A_offset += " << increment_A << ";\n"; + if (lengthExtra_B != "1") + out << SP << SP << opName << "_B_offset += " << increment_B << ";\n"; + + out << "}\n"; // end of loop on the stacked multiplications + } + return out.str(); } - std::string Generate(std::string opName) override { + std::string Generate_GPU_ALPAKA(std::string opName) override { opName = "op_" + opName; if (fShapeA.empty() || fShapeB.empty() || fShapeY.empty() || (fNC != "" && fShapeC.empty())) { throw std::runtime_error("TMVA SOFIE Gemm Op called to Generate without being initialized first"); } std::stringstream out; - out << "\n//--------- Gemm\n"; + out << "\n//--------- Gemm_GPU_ALPAKA\n"; out << SP << "char " << opName << "_transA = " << (fAttrTransA ? "\'t\'" : "\'n\'") << ";\n"; out << SP << "char " << opName << "_transB = " << (fAttrTransB ? 
"\'t\'" : "\'n\'") << ";\n"; // need to consider case A and B have dim > 2 (for MatMul) @@ -315,20 +452,20 @@ namespace SOFIE{ for (int64_t i = 0; i < dimY-2; i++) { sA.push_back(fShapeY[i]); } - auto lengthGemm = ConvertDynamicShapeToLength(sY); // size of the Gemm operation - auto lengthExtra = ConvertDynamicShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) + auto lengthGemm = ConvertDimShapeToLength(sY); // size of the Gemm operation + auto lengthExtra = ConvertDimShapeToLength(sA); // extra length in case input tensors are of dim>2 (MatMul) out << SP << "int " << opName << "_m = " << m << ";\n"; out << SP << "int " << opName << "_n = " << n << ";\n"; out << SP << "int " << opName << "_k = " << k << ";\n"; out << SP << "float " << opName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrAlpha << ";\n"; - out << SP << "float " << opName << "_beta = " << std::setprecision(std::numeric_limits::max_digits10) << fAttrBeta << ";\n"; - out << SP << "int " << opName << "_lda = " << (fAttrTransA ? m : k) << ";\n"; - out << SP << "int " << opName << "_ldb = " << (fAttrTransB ? 
k : n) << ";\n"; + + // restricting to a 0 beta since BIAS is configured separately through sofieBLAS interface + out << SP << "float " << opName << "_beta = 0;\n"; // case bias is present if (!fNC.empty()){ - if (fNC2 == fNC) { + if (!fBroadcastBias) { // add a check in case broadcasting was not needed or done outside of session // C should have smaller dimension of Y if (!fIsDynamic) { @@ -358,39 +495,39 @@ namespace SOFIE{ } // in the case of bias if (!fNC.empty()){ - out << SP << "std::copy(" << "tensor_" << fNC2 << ", " << "tensor_" << fNC2 << " + " << lengthGemm << ", " - << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << "_yoffset"; - out << ");\n"; - } - - - if (fType == "float"){ - - out << SP << "BLAS::sgemm_(&" << opName << "_transB, &" << opName << "_transA, &" << opName - << "_n, &" << opName << "_m, &" << opName << "_k, &" << opName << "_alpha, " << "tensor_" << fNB - << ", &" << opName << "_ldb, " << "tensor_" << fNA << ", &" << opName << "_lda, &" << opName << "_beta, " - << "tensor_" << fNY; - if (doStackMul) out << " + " << opName << "_yoffset"; - out << ", &" << opName << "_n);\n"; - - if(fActivation == EActivationType::RELU){ - out << SP << "for (int id = 0; id < " << SOFIE::ConvertDynamicShapeToLength(fShapeY) << " ; id++){\n"; - out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNY << "[id] > 0 )? tensor_" << fNY << "[id] : 0);\n"; - out << SP << "}\n"; + if (fActivation == EActivationType::RELU){ + out << SP << "blas.gemmrelu("< GetBlasRoutines() override { return { std::string("Gemm"), std::string("Gemv") }; } + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNY); + fNY = fusable_tensor_name; + fOutputTensorNames[0] = fNY; + } + std::string GetBlasConfig(){ + int64_t dimA = fShapeA.size(); + int64_t dimB = fShapeB.size(); + auto m = (fAttrTransA ? 
fShapeA[dimA-1].GetVal() : fShapeA[dimA-2].GetVal()); + auto n = (fAttrTransB ? fShapeB[dimB-2].GetVal() : fShapeB[dimB-1].GetVal()); + auto k = (fAttrTransA ? fShapeA[dimA-2].GetVal() : fShapeA[dimA-1].GetVal()); + auto lda = (fAttrTransA ? m : k); + auto ldb = (fAttrTransB ? k : n); + auto ldc = n; + return n+", "+m+", "+k+", "+ldb+", "+lda+", "+ldc; + } }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc b/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc index bec7760..ebf4daf 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LSTM.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_LSTM_I #define SOFIE_ROPERATOR_LSTM_I - namespace SOFIE { template @@ -291,7 +290,7 @@ auto ROperator_LSTM::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { - out << SP << fType << " *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << fType << " const *" << OpName << "_input = tensor_" << fNX << ";\n"; } else { if (fUseSession) out << SP << fType << " * " << OpName << "_input = fVec_" << OpName << "_input.data();\n"; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx index 17b77b3..12ea5b7 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LayerNormalization.hxx @@ -3,16 +3,15 @@ #include "SOFIE/RModel.hxx" #include "SOFIE/SOFIE_common.hxx" - #include #include - namespace SOFIE { template class ROperator_LayerNormalization : public ROperator { private: + bool fCastToFloat = false; // flag to indicate if operation 1 are in floats (to be impl) int fAttrAxis; float fAttrEpsilon; size_t fAttrStashType; @@ -30,7 +29,7 @@ private: std::vector fShapeX; std::vector fShapeScale; - std::vector fShapeB; // shape of input Bias (B) is assumed to be fully defined + std::vector fShapeB; std::vector fShapeY; std::vector fShapeMean; std::vector fShapeInvStdDev; @@ -39,8 +38,8 @@ 
private: size_t fSize; // Size of the input // size_t fAxisDim; - std::vector fNormalizedShape; - std::vector fAxesShape; + std::vector fNormalizedShape; // shape from X[ axis,...,N-1] + std::vector fAxesShape; // shape from X[0,..,axis-1] // lengths in string format std::string fLength; // Length of the input std::string fNormalizedLength; @@ -78,10 +77,10 @@ public: void Initialize(RModel& model) override { if (!model.CheckIfTensorAlreadyExist(fNX)) { - throw std::runtime_error("TMVA::SOFIE - Tensor " + fNX + " not found."); + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Tensor " + fNX + " not found."); } bool isDynamic = model.IsDynamicTensor(fNX); - fShapeX = model.GetDynamicTensorShape(fNX); + fShapeX = model.GetDimTensorShape(fNX); fShapeY = fShapeX; model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShapeY); // Type of the output @@ -93,18 +92,17 @@ public: // Shape of fShapeX[0, ..., fAxis) fAxesShape = std::vector(fShapeX.begin(), fShapeX.begin() + fAxis); // Length of the axes - fAxesLength = ConvertDynamicShapeToLength(fAxesShape); + fAxesLength = ConvertDimShapeToLength(fAxesShape); // Shape of fShapeX[fAxis, ..., fSize) fNormalizedShape = std::vector(fShapeX.begin() + fAxis, fShapeX.end()); // Length of the normalized axis - fNormalizedLength = ConvertDynamicShapeToLength(fNormalizedShape); + fNormalizedLength = ConvertDimShapeToLength(fNormalizedShape); // length of the input - fLength = ConvertDynamicShapeToLength(fShapeX); + fLength = ConvertDimShapeToLength(fShapeX); // Type of mean and std ETensorType type = (fAttrStashType == 1) ? 
ETensorType::FLOAT : model.GetTensorType(fNX); // Mean - if (fNMean.empty()) { - fNMean = "Mean" + fNX; + if (!fNMean.empty()) { // cannot use initializer list with one element since it is ambiguous if (isDynamic) // add size_t(-1) to indicate that shape is an expression @@ -113,29 +111,60 @@ public: model.AddIntermediateTensor(fNMean, type, std::vector(1,std::stoi(fAxesLength))); } // Inverse Standard Deviation - if (fNInvStdDev.empty()) { - fNInvStdDev = "InvStdDev" + fNX; + if (!fNInvStdDev.empty()) { if (isDynamic) model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,Dim{fAxesLength,std::size_t(-1)})); else model.AddIntermediateTensor(fNInvStdDev, type, std::vector(1,std::stoi(fAxesLength))); } + // if mean and stdev are not empty they are not defined in the output list // Cast X to float if (fAttrStashType == 1 && model.GetTensorType(fNX) != ETensorType::FLOAT) { - fNCastedX = "Casted" + fNX; - model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); - fNNormalizedX = "Normalized" + fNX; - model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + fCastToFloat = true; + fType = "float"; + // fNCastedX = "Casted" + fNX; + // model.AddIntermediateTensor(fNCastedX, ETensorType::FLOAT, fShapeX); + // fNNormalizedX = "Normalized" + fNX; + // model.AddIntermediateTensor(fNNormalizedX, ETensorType::FLOAT, fShapeX); + } + // scale shape + fShapeScale = model.GetDimTensorShape(fNScale); + // appends 1 to scale shapes if missing + size_t dimScale = fShapeScale.size(); + if (dimScale < fSize) { + for (size_t i = 0; i < fSize-dimScale; i++) + fShapeScale.insert(fShapeScale.begin(), Dim{1}); + } + // check also shape if consistent now + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1 && fShapeScale[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Scale Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); } - // Broadcast the bias if (!fNB.empty()) { - fShapeB = 
model.GetTensorShape(fNB); - size_t lengthB = ConvertShapeToLength(fShapeB); - if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { - fNBroadcastedB = "Broadcasted" + fNB; - model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + fShapeB = model.GetDimTensorShape(fNB); + // appends 1 to bias shapes if missing + size_t dimB = fShapeB.size(); + if (dimB < fShapeX.size()) { + for (size_t i = 0; i < fSize-dimB; i++) + fShapeB.insert(fShapeB.begin(), Dim{1}); + } + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1 && fShapeB[i] != fShapeX[i]) + throw std::runtime_error("TMVA::SOFIE - LayerNormalization - Bias Tensor has invalid shape " + ConvertDimShapeToString(fShapeScale)); } } + + std::cout << "bias + scale " << ConvertDimShapeToString(fShapeB) << " " << ConvertDimShapeToString(fShapeScale) << std::endl; + + // // Broadcast the bias + // if (!fNB.empty()) { + // fShapeB = model.GetTensorShape(fNB); + // size_t lengthB = ConvertShapeToLength(fShapeB); + // if (isDynamic || lengthB < static_cast(std::stoi(fLength))) { + // fNBroadcastedB = "Broadcasted" + fNB; + // model.AddIntermediateTensor(fNBroadcastedB, ConvertStringToType(fType), fShapeX); + // } + // } model.AddNeededStdLib("cmath"); } @@ -145,8 +174,8 @@ public: if (!fNBroadcastedB.empty()) { out << SP << "// Broadcasting the bias of LayerNormalization op\n"; out << SP << "{\n"; - out << SP << SP << "float* data = SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; - out << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertDynamicShapeToString(fShapeX) << ");\n"; + out << SP << SP << "float* data = TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_"; + out << fNB << ", " << ConvertDimShapeToString(fShapeB) << ", " << ConvertDimShapeToString(fShapeX) << ");\n"; out << SP << "std::copy(data, data + " << fLength << ", tensor_" << fNBroadcastedB << ");\n"; out << SP << "delete[] data;\n"; out << SP << "}\n"; @@ -161,10 +190,6 @@ public: 
throw std::runtime_error("TMVA::SOFIE LayerNormalization operator " + opName + " called to generate without being initialized first."); } - if (fShapeX.size() > 5) { - throw std::runtime_error("TMVA::SOFIE LayerNormalization operator not " - "implemented for input tensor of size > 5."); - } std::stringstream out; @@ -178,10 +203,32 @@ public: } auto strides = UTILITY::ComputeStrideFromShape(fShapeX); - std::string InputIndex = "axis_0 * " + strides[0].GetVal(); + std::string inputIndex = "axis_0 * " + strides[0].GetVal(); for (size_t i = 1; i < fSize; i++) { - InputIndex += " + axis_" + std::to_string(i) + " * " + strides[i].GetVal(); + inputIndex += " + axis_" + std::to_string(i); + if (i < fSize-1) inputIndex += " * " + strides[i].GetVal(); + } + auto scaleStrides = UTILITY::ComputeStrideFromShape(fShapeScale); + std::string scaleIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeScale[i].dim != 1) { + if (!scaleIndex.empty()) scaleIndex += " + "; + scaleIndex += "axis_" + std::to_string(i); + if ( scaleStrides[i].dim != 1) scaleIndex += " * " + scaleStrides[i].GetVal(); + } + } + if (scaleIndex.empty()) scaleIndex = "0"; + + auto biasStrides = UTILITY::ComputeStrideFromShape(fShapeB); + std::string biasIndex; + for (size_t i = 0; i < fSize; i++) { + if (fShapeB[i].dim != 1) { + if (!biasIndex.empty()) biasIndex += " + "; + biasIndex += "axis_" + std::to_string(i); + if ( biasStrides[i].dim != 1) biasIndex += " * " + biasStrides[i].GetVal(); + } } + if (biasIndex.empty()) biasIndex = "0"; auto axesStrides = UTILITY::ComputeStrideFromShape(fAxesShape); std::string axesIndex = "axis_" + std::to_string(0) + " * " + axesStrides[0].GetVal(); @@ -189,51 +236,33 @@ public: axesIndex += " + axis_" + std::to_string(i) + " * " + axesStrides[i].GetVal(); } - auto normalizedStrides = UTILITY::ComputeStrideFromShape(fNormalizedShape); - std::string normalizedIndex = "axis_" + std::to_string(fAxis) + " * " + normalizedStrides[0].GetVal(); - for (size_t i = fAxis + 1; i 
< fSize; i++) { - normalizedIndex += " + axis_" + std::to_string(i) + " * " + normalizedStrides[i - fAxis].GetVal(); - } - if (!fNCastedX.empty()) { - // Cast X to float - out << SP << "for (size_t i = 0; i < " << fLength << "; i++) {\n"; - out << SP << SP << "tensor_" << fNCastedX << "[i] = " << "static_cast(tensor_" << fNX; - out << "[i]);\n"; - out << SP << "}\n"; - } + // compute mean and std-dev. Save in tensors if requested out << SP << "// Compute the mean\n"; - // Loop over the normalized dimensions + + // Loop over all the outer dims in [0, fAxis) for (size_t i = 0; i < fAxis; i++) { std::string iIdx = "axis_" + std::to_string(i); out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] << "; " << iIdx << "++) {\n"; } - out << SP << SP << fType << " sum = 0.;\n"; - // loop over all the dims in [0, fAxis) + out << SP << SP << fType << " mean = 0.;\n"; + // loop over the normalized dimensions (fAxis,....,N-1) for (size_t j = fAxis; j < fSize; j++) { std::string jIdx = "axis_" + std::to_string(j); out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++) {\n"; } - out << SP << SP << SP << "sum += tensor_" << fNX << "[" << InputIndex << "];\n"; + out << SP << SP << SP << "mean += tensor_" << fNX << "[" << inputIndex << "];\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = sum / " << fType << "("; - out << fNormalizedLength << ");\n"; - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + out << SP << SP << "mean /= " << fType << "(" << fNormalizedLength << ");\n"; + out << SP << "// Compute the inverse Standard Deviation\n"; - // Loop over the normalized dimensions - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } + // 
Set sum = 0 out << SP << SP << fType << " sum = 0.;\n"; // loop over all the dims in [0, fAxis) @@ -242,91 +271,46 @@ public: out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx << "++){\n"; } - out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << InputIndex << "] - tensor_" - << fNMean << "[" << axesIndex << "];\n"; + out << SP << SP << SP << "float tmp = tensor_" << fNX << "[" << inputIndex << "] - mean;\n"; out << SP << SP << SP << "sum += tmp*tmp;\n"; for (size_t j = fAxis; j < fSize; j++) { out << SP << SP << "}\n"; } - out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = 1 / std::sqrt("; + out << SP << SP << fType << " invStdDev = 1 / std::sqrt("; out << "sum / " << fType << "(" << fNormalizedLength << ") + " << fAttrEpsilon << ");\n"; - for (size_t i = 0; i < fAxis; i++) { - out << SP << "}\n"; - } - if (!fNCastedX.empty()) { - out << "// NormalizedX = InvStdDev * (CastedX - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNNormalizedX << "[" << InputIndex << "] = tensor_"; - out << fNInvStdDev << "[" << axesIndex << "] * (tensor_" << fNCastedX << "[" << InputIndex; - out << "] - tensor_" << fNMean << "[" << axesIndex << "])\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - out << "// Y = Scale o NormalizedX"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " 
< " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << axesIndex << "] * static_cast<" << fType << ">(tensor_" << fNCastedX << "[" << InputIndex; - out << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } - } else { - out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; - for (size_t i = 0; i < fAxis; i++) { - std::string iIdx = "axis_" + std::to_string(i); - out << SP << "for (size_t " << iIdx << " = 0; " << iIdx << " < " << inputShape[i] - << "; " << iIdx << "++){\n"; - } - for (size_t j = fAxis; j < fSize; j++) { - std::string jIdx = "axis_" + std::to_string(j); - out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] - << "; " << jIdx << "++){\n"; - } - out << SP << SP << SP << "tensor_" << fNY << "[" << InputIndex << "] = tensor_" << fNScale; - out << "[" << normalizedIndex << "] * tensor_" << fNInvStdDev << "[" << axesIndex; - out << "] * (tensor_" << fNX << "[" << InputIndex << "] - tensor_" << fNMean << "["; - out << axesIndex << "]);\n"; - for (size_t j = fAxis; j < fSize; j++) { - out << SP << SP << "}\n"; - } - for (size_t i = fAxis; i < fSize; i++) { - out << SP << "}\n"; - } + + // set output mean and invStdDev if requested + if (!fNMean.empty()) + out << SP << SP << "tensor_" << fNMean << "[" << axesIndex << "] = mean;\n"; + if (!fNInvStdDev.empty()) + out << SP << SP << "tensor_" << fNInvStdDev << "[" << axesIndex << "] = invStdDev;\n"; + + // scale and add bias + + out << SP << "// Y = Scale o InvStdDev (X - Mean)\n"; + + for (size_t j = fAxis; j < fSize; j++) { + std::string jIdx = "axis_" + 
std::to_string(j); + out << SP << SP << "for (size_t " << jIdx << " = 0; " << jIdx << " < " << inputShape[j] << "; " << jIdx + << "++){\n"; } + out << SP << SP << SP << "tensor_" << fNY << "[" << inputIndex << "] = tensor_" << fNScale; + out << "[" << scaleIndex << "] * invStdDev * (tensor_" << fNX << "[" << inputIndex << "] - mean)"; - if (!fNB.empty()) { - std::string bias = "tensor_" + (fNBroadcastedB.empty() ? fNB : fNBroadcastedB); - out << SP << "// Add the bias to Y\n"; - out << SP << "int " << opName << "_n = " << fLength << ";\n"; - out << SP << "float " << opName << "_alpha = 1.;\n"; - out << SP << "int " << opName << "_inc = 1;\n"; - out << SP << "BLAS::saxpy_(&" << opName << "_n, &" << opName << "_alpha, " << bias << ", &"; - out << opName << "_inc, " << "tensor_" << fNY << ", &" << opName << "_inc);\n"; + // add bias if needed + if (!fNB.empty()) + // assume bias has index as scale + out << " + tensor_" << fNB << "[" << biasIndex << "]"; + out << ";\n"; + + // close loops on normalizing dim [..,fAxis,...fSize-1] + for (size_t j = fAxis; j < fSize; j++) { + out << SP << SP << "}\n"; + } + // close loops on the other dimensions [0,...,fAxis] + for (size_t i = 0; i < fAxis; i++) { + out << SP << "}\n"; } return out.str(); @@ -339,5 +323,4 @@ public: } // namespace SOFIE - #endif diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx index 8fefa6d..1218b56 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_LeakyRelu.hxx @@ -27,7 +27,8 @@ public: ROperator_LeakyRelu(){} ROperator_LeakyRelu(float alpha,std::string nameX, std::string nameY): falpha(alpha),fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)) - { + { + fKind = OperatorKind::LEAKYRELU; if(std::is_same::value){ fType = "float"; } @@ -75,6 +76,61 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = 
"\n//------ LEAKY_RELU_KERNEL_ALPAKA\n"; + op += "struct LeakyReluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements, T alpha) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < numElements) {\n"; + op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? data[idx] : alpha * data[idx];\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return "LeakyReluKernel leakyReluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator LeakyRelu called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ LEAKY_RELU_GPU_ALPAKA\n"; + out << SP << "constexpr float " << OpName << "_alpha = " << std::setprecision(std::numeric_limits::max_digits10) << falpha << ";\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNX << " = {elementsPerGrid_" << fNX << ", elementsPerThread_" << fNX << "};\n"; + out << SP << "auto const workDiv_" << fNX << " = alpaka::getValidWorkDiv(kernelCfg_" << fNX << ", devAcc, leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", leakyReluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "), " << OpName << "_alpha);\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + + + std::string 
GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc b/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc index c03c1c2..c10c2a5 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc +++ b/src/SOFIE_core/inc/SOFIE/ROperator_RNN.icc @@ -1,7 +1,6 @@ #ifndef SOFIE_ROPERATOR_RNN_I #define SOFIE_ROPERATOR_RNN_I - namespace SOFIE { template @@ -230,7 +229,7 @@ auto ROperator_RNN::Generate(std::string OpName) // set the input if (fAttrLayout == 0) { if (fType == "float") { - out << SP << "float *" << OpName << "_input = tensor_" << fNX << ";\n"; + out << SP << "float const*" << OpName << "_input = tensor_" << fNX << ";\n"; } } else { if (fUseSession) diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx index 8af272d..fea9814 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Range.hxx @@ -8,7 +8,6 @@ #include #include - namespace SOFIE{ template @@ -89,9 +88,9 @@ public: model.AddDynamicTensor(fNOutput, type, fShape); } if (model.Verbose()) { - std::cout << "Range -> output is " << fNOutput << " "; - if (fIsOutputConstant) std::cout << ConvertDynamicShapeToString(fShape) << std::endl; - else std::cout << ConvertDynamicShapeToString(model.GetDynamicTensorShape(fNOutput)) << std::endl; + std::cout << "Range -> output is " << fNOutput << " : " << ConvertDimShapeToString(fShape); + if (fIsOutputConstant) std::cout << " : " << ConvertValuesToString(model.GetTensorData(fNOutput)); + std::cout << std::endl; } } @@ -121,5 +120,5 @@ public: }; }//SOFIE - + #endif //SOFIE_ROPERATOR_RANGE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx 
b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx index 8062dca..fcc3cd6 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Relu.hxx @@ -24,6 +24,7 @@ public: ROperator_Relu(){} ROperator_Relu(std::string nameX, std::string nameY): fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::RELU; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; } @@ -42,11 +43,11 @@ public: throw std::runtime_error("TMVA SOFIE Relu Op Input Tensor " + fNX + " is not found in model"); } - fShape = model.GetDynamicTensorShape(fNX); + fShape = model.GetDimTensorShape(fNX); model.AddIntermediateTensor(fNY, model.GetTensorType(fNX), fShape); if (model.Verbose()) { - std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDynamicShapeToString(fShape) << std::endl; + std::cout << "Relu : " << fNX << " -> " << fNY << " " << ConvertDimShapeToString(fShape) << std::endl; } } @@ -57,7 +58,7 @@ public: throw std::runtime_error("TMVA SOFIE Operator Relu called to Generate without being initialized first"); } std::stringstream out; - auto length = ConvertDynamicShapeToLength(fShape); + auto length = ConvertDimShapeToLength(fShape); out << "\n//------ RELU\n"; out << SP << "for (int id = 0; id < " << length << " ; id++){\n"; out << SP << SP << "tensor_" << fNY << "[id] = ((tensor_" << fNX << "[id] > 0 )? 
tensor_" << fNX << "[id] : 0);\n"; @@ -65,6 +66,59 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) { + std::string op; + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + + op = "\n//------ RELU_KERNEL_ALPAKA\n"; + op += "struct ReluKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + SP + "auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (idx < numElements) {\n"; + op += SP + SP + SP + "out[idx] = data[idx] >= T(0) ? data[idx] : 0;\n"; + op += SP + SP + "}\n"; + op += SP + "}\n"; + op += "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "ReluKernel reluKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Relu called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertDimShapeToLength(fShape); + out << "\n//------ RELU_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNY << " = {elementsPerGrid_" << fNY << ", elementsPerThread_" << fNY << "};\n"; + out << SP << "auto const workDiv_" << fNY << " = alpaka::getValidWorkDiv(kernelCfg_" << fNY << ", devAcc, reluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNY + << ", reluKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void 
UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx index 66a7e09..2b3391c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Reshape.hxx @@ -6,7 +6,10 @@ #include "SOFIE/RModel.hxx" #include +#include #include +#include + namespace SOFIE{ @@ -19,17 +22,20 @@ class ROperator_Reshape final : public ROperator private: bool fVerbose = false; + bool fDimInput = false; + bool fDynamicShape = false; ReshapeOpMode fOpMode = Reshape; // type of Reshape operator int fAllowZero = 0; // (for Reshape) zero in tensor shape makes output shape equal to input tensor shape int fAxis = 1; // (for Flatten) std::string fNData; // input data tensor name - std::string fNShape; // reshape tensor name + std::string fNInput2; // reshape or axes tensor name depending on operator std::string fNOutput; // output tensor name - std::vector fShapeInput; // input shape data - std::vector fShapeOutput; // output shape data + std::vector fShapeInput; // input shape data + std::vector fShapeOutput; // output shape data std::vector fAttrAxes; // axes attributes (provided for all version of Squeeze/Unsqueeze) + std::vector fShape; // shape tensor values provided for Reshape public: @@ -42,16 +48,16 @@ public: } ROperator_Reshape(){} - ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameShape, std::string nameOutput) - : fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNShape(UTILITY::Clean_name(nameShape)), - fNOutput(UTILITY::Clean_name(nameOutput)) + ROperator_Reshape(ReshapeOpMode opMode, int attr_value, std::string nameData, std::string nameInput2, std::string nameOutput) + : 
fOpMode(opMode), fNData(UTILITY::Clean_name(nameData)), fNInput2(UTILITY::Clean_name(nameInput2)), + fNOutput(UTILITY::Clean_name(nameOutput)) { if (opMode == Reshape) fAllowZero = attr_value; if (opMode == Flatten) fAxis = attr_value; fInputTensorNames = { fNData }; - if(!fNShape.empty()){ - fInputTensorNames.emplace_back(fNShape); + if(!fNInput2.empty()){ + fInputTensorNames.emplace_back(fNInput2); } fOutputTensorNames = { fNOutput }; } @@ -63,6 +69,8 @@ public: fAttrAxes(attrAxes) { assert(fOpMode == Squeeze || fOpMode == Unsqueeze); + fInputTensorNames = { fNData }; + fOutputTensorNames = { fNOutput }; } // output type is same as input @@ -70,94 +78,165 @@ public: auto ret = std::vector(1, input[0]); return ret; } + std::vector> ShapeInference(std::vector> input) override { + return input; + } // output shape - std::vector> ShapeInference(std::vector> input) override { - std::vector> ret; + std::vector> ShapeInference(const std::vector> & input) { + std::vector> ret; auto & input_shape = input[0]; - if (fOpMode == Reshape) { - if (input.size() != 2) throw std::runtime_error("TMVA SOFIE Reshape Op needs 2 input tensors"); - auto output_shape = input[1]; // the provided shape - size_t input_length = ConvertShapeToLength(input_shape); - size_t output_length = ConvertShapeToLength(output_shape); - // (input_length == output_length) is the easy case : (2,3,4) -> (2,12) - if (input_length != output_length) { - if ((output_length == 0 && fAllowZero == 0) || static_cast(output_length) < 0) { - // in this case value 0 or -1 in shape are automatically corrected - bool replacementDone = false; - for (size_t i = 0; i < output_shape.size(); i++) { - if (output_shape[i] == 0 || output_shape[i] == static_cast(-1)) { - if (replacementDone) { - throw std::runtime_error("TMVA Reshape Op : output shape has multiple negative or zero values"); + // correct the provided shape (here we have the value) for 0 or -1 + std::vector output_shape(fShape.size()); + assert(!fShape.empty() && 
!fDynamicShape); + for (size_t i = 0; i < output_shape.size(); i++) { + if (fShape[i] > 0 || (fAllowZero && fShape[i] >= 0)) + output_shape[i] = Dim{ static_cast(fShape[i]) }; + else if (!fAllowZero && fShape[i] == 0) + output_shape[i] = input_shape[i]; + } + // now case of -1 in shape + for (size_t i = 0; i < output_shape.size(); i++) { + if (fShape[i] == -1) { + auto tmp = output_shape; + tmp.erase(tmp.begin() + i); + auto tmp_length = ConvertDimShapeToLength(tmp); + auto input_length = ConvertDimShapeToLength(input_shape); + if (fVerbose) + std::cout << "reshape- try simplifying " << ConvertDimShapeToString(input_shape) << " with length " + << input_length << " to " << tmp_length << std::endl; + + if (IsInteger(tmp_length) && IsInteger(input_length)) + output_shape[i] = Dim{static_cast(std::stoi(input_length) / std::stoi(tmp_length))}; + else if (IsInteger(tmp_length) && std::stoi(tmp_length) == 1) { + output_shape[i] = Dim{input_length, static_cast(-1)}; + } + else { + //we can try simplifying expression if tmp_length is integer and part of input_length + // contains tmp_length + bool canSimplify = false; + std::vector reduced_input; + if (IsInteger(tmp_length)) { + + // try to tokenize with * the input length + + std::stringstream ss(input_length); + + std::string token; + + // Tokenizing w.r.t. 
space '*' + while(getline(ss, token, '*')) + { + // remove any whitespace + token.erase(std::remove_if(token.begin(), token.end(), + [](unsigned char x) { return std::isspace(x); }), token.end()); + if (token != tmp_length) { + if (IsInteger(token)) { + size_t il = static_cast(std::stoi(input_length)); + size_t tl = static_cast(std::stoi(tmp_length)); + if ((il % tl) == 0) { + canSimplify = true; + reduced_input.push_back(Dim{il / tl}); + } + } else { + reduced_input.push_back(Dim{token}); + } + } else { + // token is equal to tmp_length, can be not considered and is simplified + canSimplify = true; + } } - auto tmp = output_shape; - tmp.erase(tmp.begin() + i); - auto tmp_length = ConvertShapeToLength(tmp); - output_shape[i] = input_length / tmp_length; - replacementDone = true; } + if (canSimplify) { + // if length contains * we need to add some brackets + std::string res_shape = ConvertDimShapeToLength(reduced_input); + if (res_shape.find('*') != std::string::npos) + output_shape[i] = Dim{std::string("(") + res_shape + ")", static_cast(-1)}; + else + output_shape[i] = Dim{res_shape}; + } + if (!canSimplify) + output_shape[i] = Dim{std::string("(") + input_length + " / (" + tmp_length + "))", static_cast(-1)}; } - if (fVerbose) - std::cout << "Reshape: correct output shape from " << ConvertShapeToString(input[1]) - << " to " << ConvertShapeToString(output_shape) << std::endl; - } - if (ConvertShapeToLength(output_shape) != input_length) { - throw std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertShapeToString(input_shape) + - ConvertShapeToString(output_shape)); + + break; // cannot have more than -1 } + // throw std::runtime_error( + // "TMVA Reshape Op : output shape has multiple negative or zero values"); + } + + if (fVerbose) + std::cout << "Reshape: correct output shape to " << ConvertDimShapeToString(output_shape) << std::endl; + + if (!fDimInput && ConvertDimShapeToLength(output_shape) != ConvertDimShapeToLength(input_shape)) { + throw 
std::runtime_error("TMVA Reshape Op : Invalid shapes : " + ConvertDimShapeToString(input_shape) + + ConvertDimShapeToString(output_shape)); } ret.push_back(output_shape); } else if (fOpMode == Flatten) { - // flattenig case - size_t inputSize = ConvertShapeToLength(input_shape); - size_t b = input[0][0]; - std::vector newShape = {b, inputSize / b}; + // flatten case + if (fAxis < 0) + fAxis += input_shape.size(); + auto s1 = std::vector(input_shape.begin(), input_shape.begin() + fAxis); + auto s2 = std::vector(input_shape.begin() + fAxis, input_shape.end()); + auto l1 = ConvertDimShapeToLength(s1); + auto l2 = ConvertDimShapeToLength(s2); + std::vector newShape = {Dim{l1}, Dim{l2}}; ret.push_back(newShape); - } else if (fOpMode == Squeeze) { // squeeze // assume no axis is provided - remove all axes with value equal to 1 - auto output_shape = input[0]; - if (input.size() == 1) { + auto output_shape = input_shape; + if (fAttrAxes.empty()) { size_t i = 0; while (i < output_shape.size()) { - if (output_shape[i] == 1 ) { + if (output_shape[i] == Dim{1}) { output_shape.erase(output_shape.begin() + i); - } - else { + } else { i++; } } - } else if (input.size() == 2) { - auto & axes = input[1]; - for (size_t i = 0; i < axes.size(); i++){ - if (output_shape[axes[i]] != 1) - throw std::runtime_error("TMVA Squeeze Op : Invalid axes : " + ConvertShapeToString(axes) + - ConvertShapeToString(output_shape)); - output_shape.erase(output_shape.begin() + axes[i]); + } else { + std::cout << "getting shape for Squeeze...from attribute\n"; + auto axes = fAttrAxes; + for (size_t i = 0; i < axes.size(); i++) { + std::cout << i << " " << axes[i] << std::endl; + if (axes[i] < 0) + axes[i] += input_shape.size(); + if (!(output_shape[axes[i]] == Dim{1})) + throw std::runtime_error("TMVA Squeeze Op : Invalid axis value " + std::to_string(axes[i]) + + " for " + ConvertDimShapeToString(output_shape)); + } + // for calling vector::erase we must sort axes in decreasing order to avoid + 
std::sort(axes.begin(), axes.end(), std::greater()); + for (auto & axis : axes) { + std::cout << "erase give axis " << axis << " -> "; + for (auto & o : output_shape) std::cout << o << " , "; + std::cout << std::endl; + output_shape.erase(output_shape.begin() + axis); } } ret.push_back(output_shape); } - else if (fOpMode == Unsqueeze) { // unsqueeze - assert(input.size() == 2); - auto output_shape = input[0]; - auto &axes = input[1]; + std::cout << "doing unsqueeze....\n"; + assert(!fAttrAxes.empty()); + auto output_shape = input_shape; + auto &axes = fAttrAxes; // output rank int64_t r = input[0].size() + axes.size(); - for (auto & a : axes) { + for (auto &a : axes) { int64_t i = static_cast(a); - if ( i < -r || i > r - 1 ) + if (i < -r || i > r - 1) throw std::runtime_error("TMVA Unsqueeze Op - axes input is not in correct range"); if (i >= 0) - output_shape.insert(output_shape.begin() + i, 1); + output_shape.insert(output_shape.begin() + i, Dim{1}); else - //negative axes - output_shape.insert(output_shape.end() + i + 1, 1); + // negative axes + output_shape.insert(output_shape.end() + i + 1, Dim{1}); } ret.push_back(output_shape); } @@ -167,33 +246,55 @@ public: void Initialize(RModel& model) override { fVerbose = model.Verbose(); + if (fVerbose) + std::cout << "initialize reshape op type " << fOpMode << " - " << fNInput2 << " " << fNData << std::endl; + if (model.CheckIfTensorAlreadyExist(fNData) == false) { // input must be a graph input, or already initialized intermediate tensor throw std::runtime_error("TMVA Reshape Op Input Tensor " + fNData + " is not found in model"); } - fShapeInput = model.GetTensorShape(fNData); - // check if optional shape tensor exist - if (!fNShape.empty()) { - if (model.CheckIfTensorAlreadyExist(fNShape)) { - auto dptr = model.GetInitializedTensorData(fNShape); - auto input_shape = static_cast(dptr.get()); - auto vec = model.GetTensorShape(fNShape); - assert(vec.size() == 1); - size_t n = vec[0]; // size of shape input tensor - - 
std::vector descShape(n); - std::copy(input_shape, input_shape + n, descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; - // set flag to not write tensor in weight file. Its data will be hard-coded in way model is constructed - model.SetNotWritableInitializedTensor(fNShape); + fShapeInput = model.GetDimTensorShape(fNData); + fDimInput = model.IsDynamicTensor(fNData); + // check if optional tensor exists defining shape or axes + if (!fNInput2.empty()) { + if (model.CheckIfTensorAlreadyExist(fNInput2)) { + if (model.IsInitializedTensor(fNInput2)) { + // assume input shape is an initialized tensor + auto dptr = model.GetInitializedTensorData(fNInput2); + auto values = static_cast(dptr.get()); + auto vec = model.GetTensorShape(fNInput2); + size_t n = 1; + if (vec.size() > 0) + n = vec[0]; // size of shape input tensor + // copy values in fShape vector or fAttrAxes + if (fOpMode == Reshape) + fShape = std::vector(values, values + n); + else + fAttrAxes = std::vector(values, values + n); + + fShapeOutput = ShapeInference({fShapeInput})[0]; + // set flag to not write tensor in weight file. 
Its data will be hard-coded in way model is constructed + model.SetNotWritableInitializedTensor(fNInput2); + } else if (model.IsShapeTensor(fNInput2)) { + auto shapeData = model.GetShapeTensorValues(fNInput2); + fShapeOutput = shapeData; + } else { + // we cannot get shape at initialization time but at run-time + fDynamicShape = true; + // size of shape output us given by size of shape input tensor + auto shapeInput2 = model.GetTensorShape(fNInput2); + fShapeOutput.resize(shapeInput2[0]); + for (size_t i = 0; i < fShapeOutput.size(); i++) { + fShapeOutput[i] = Dim{ std::string("s_") + fNOutput + "_" + std::to_string(i)}; + } + } } else { - throw std::runtime_error("TMVA Reshape Op Shape Tensor " + fNShape + " is not found in model"); + throw std::runtime_error("TMVA Reshape Op 2nd input Tensor " + fNInput2 + " is not found in model"); } } else if (!fAttrAxes.empty()) { - // case fNShape is empty and axes are provided as attributes - std::vector descShape(fAttrAxes.size()); - std::copy(fAttrAxes.begin(), fAttrAxes.end(), descShape.begin()); - fShapeOutput = ShapeInference({fShapeInput, descShape})[0]; + // case fNShape is empty and axes are provided as attributes (e.g. 
for Unsqueeze) + std::cout << "attribute axes exists\n"; + fShapeOutput = ShapeInference({fShapeInput})[0]; } else if (fOpMode == Flatten || fOpMode == Squeeze) { fShapeOutput = ShapeInference({fShapeInput})[0]; } else { @@ -203,47 +304,103 @@ public: if (model.IsInitializedTensor(fNData) && model.GetTensorType(fNData) == ETensorType::INT64) { fIsOutputConstant = true; auto inputData = static_cast(model.GetInitializedTensorData(fNData).get()); - if (ConvertShapeToLength(fShapeInput) != ConvertShapeToLength(fShapeOutput)) + auto o_shape = ConvertShapeToInt(fShapeOutput); + if (ConvertShapeToLength(ConvertShapeToInt(fShapeInput)) != ConvertShapeToLength(o_shape) ) throw std::runtime_error("TMVA Reshape Op : Invalid Input/Output lengths"); - model.AddConstantTensor(fNOutput, fShapeOutput, inputData); + model.AddConstantTensor(fNOutput, o_shape, inputData); if (model.Verbose()) { - std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertShapeToString(fShapeOutput) << " : " << - ConvertValuesToString(ConvertShapeToLength(fShapeOutput), inputData) << std::endl; + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> " << fNOutput << " (constant) " << ConvertDimShapeToString(fShapeOutput) << " : " << + ConvertValuesToString(ConvertShapeToLength(o_shape), inputData) << std::endl; } - } else { + } + // for shape tensors we can have it if output shape is size==1 or a scalar + else if (model.IsShapeTensor(fNData) && fShapeOutput.size() <=1) { + fIsOutputConstant = true; + auto inputData = model.GetShapeTensorValues(fNData); + model.AddShapeTensor(fNOutput, inputData); + if (model.Verbose()) { + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> " << fNOutput << " (shape) " << ConvertDimShapeToString(fShapeOutput) << " : " << + ConvertDimShapeToString(inputData) << std::endl; + } + } + else { // non-constant case 
model.AddIntermediateTensor(fNOutput, model.GetTensorType(fNData), fShapeOutput); if (model.Verbose()) - std::cout << Name() << " : " << fNData << " " << ConvertShapeToString(fShapeInput) << " --> "<< fNOutput << " " << ConvertShapeToString(fShapeOutput) << std::endl; + std::cout << Name() << " : " << fNData << " " << ConvertDimShapeToString(fShapeInput) << " --> "<< fNOutput << " " << ConvertDimShapeToString(fShapeOutput) << std::endl; } } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; //no op for constant tensors - OpName = "op_" + OpName; - - // output of reshape is same as input - size_t length = ConvertShapeToLength(fShapeOutput); - if (length != ConvertShapeToLength(fShapeInput)) { - throw std::runtime_error("TMVA SOFIE Reshape Op : wrong output shape - is " + - ConvertShapeToString(fShapeOutput) + " and input is " + - ConvertShapeToString(fShapeInput)); - } std::stringstream out; - std::string opName = "Reshape"; + std::string opType = "Reshape"; if (fOpMode == Flatten) - opName = "Flatten"; + opType = "Flatten"; else if (fOpMode == Squeeze) - opName = "Squeeze"; + opType = "Squeeze"; else if (fOpMode == Unsqueeze) - opName = "Unsquueze"; + opType = "Unsquueze"; + + out << SP << "///--------" << opType << " operator " << opName << " --> " << ConvertDimShapeToString(fShapeOutput) << "\n"; - out << SP << "///--------" << opName << " operator\n" << std::endl; - out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << length << ", " << "tensor_" << fNOutput + // in case of dynamic output shape we need to set the shape value from input shape tensor + // and take case of the zero values + if (fDynamicShape) { + for (size_t i = 0; i < fShapeOutput.size(); i++) { + // since fNInput2 values are int64_t, should we check if they are negative? 
+ out << SP << "size_t " << fShapeOutput[i].param << " = " << "tensor_" << fNInput2 << "[" << i << "];\n"; + if (!fAllowZero) + out << SP << "if (tensor_" << fNInput2 << "[" << i << "] <= 0 ) " + << fShapeOutput[i].param << " = " << fShapeInput[i] << ";\n"; + } + } + + // output of reshape is same as input + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + // check needs to be done at run-time + out << SP << "if (" << lengthOut << "!=" << lengthIn << ")\n"; + out << "throw std::runtime_error(\"TMVA SOFIE Reshape Op : output lengths is different than input one\");\n"; + } + + + out << SP << "std::copy( tensor_" << fNData << ", tensor_" << fNData << " + " << lengthIn << ", " << "tensor_" << fNOutput << ");\n"; return out.str(); } +std::string Generate_GPU_ALPAKA(std::string opName) override { + if (fIsOutputConstant) return ""; + + opName = "op_" + opName; + + std::string opType = "Reshape"; + if (fOpMode == Flatten) opType = "Flatten"; + else if (fOpMode == Squeeze) opType = "Squeeze"; + else if (fOpMode == Unsqueeze) opType = "Unsqueeze"; + + std::stringstream out; + out << SP << "///------- " << opType << " operator " << opName << "\n"; + + if (fDynamicShape) { + auto lengthOut = ConvertDimShapeToLength(fShapeOutput); + auto lengthIn = ConvertDimShapeToLength(fShapeInput); + if (lengthOut != lengthIn) { + out << SP << "if (" << lengthOut << " != " << lengthIn << ")\n"; + out << SP << SP << "throw std::runtime_error(\"TMVA SOFIE " << opType + << " Op : output length is different from input length\");\n"; + } + } + + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNOutput + << ", deviceBuf_" << fNData << ");\n"; + out << SP << "alpaka::wait(queue);\n"; + + return out.str(); +} + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx index 6951017..5b17a79 100644 --- 
a/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_ScatterElements.hxx @@ -168,6 +168,114 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE ScatterElements Op called to Generate without being initialized first"); + } + + const std::size_t D = fShapeI.size(); + + auto strideY = UTILITY::ComputeStrideFromShape(fShapeY); + auto strideI = UTILITY::ComputeStrideFromShape(fShapeI); + + std::size_t totalElements = 1; + for (std::size_t d = 0; d < D; ++d) + totalElements *= fShapeI[d]; + + std::string op; + op = "\n//------ SCATTERELEMENTS_KERNEL_ALPAKA\n"; + op += SP + "struct ScatterElementsKernel_" + opName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T* Y,\n"; + op += SP + SP + SP + "int64_t const* I,\n"; + op += SP + SP + SP + "T const* U,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + op += SP + SP + SP + SP + "std::size_t remaining = elem_idx;\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const idx_" + std::to_string(d) + + " = remaining / " + strideI[d] + ";\n"; + op += SP + SP + SP + SP + "remaining -= idx_" + std::to_string(d) + + " * " + strideI[d] + ";\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "int64_t iAxis = I[elem_idx];\n"; + op += SP + SP + SP + SP + "if (iAxis < 0) iAxis += " + 
std::to_string(fShapeY[fAxis]) + ";\n\n"; + + op += SP + SP + SP + SP + "std::size_t const out_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == (std::size_t)fAxis) + ? "static_cast(iAxis)" + : "idx_" + std::to_string(d); + op += SP + SP + SP + SP + SP + coord + " * " + std::to_string(strideY[d]); + op += (d + 1 < D) ? " +\n" : ";\n\n"; + } + + if (fReduction.empty() || fReduction == "none") { + op += SP + SP + SP + SP + "Y[out_idx] = U[elem_idx];\n"; + } else if (fReduction == "add") { + op += SP + SP + SP + SP + "alpaka::atomicAdd(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "mul") { + op += SP + SP + SP + SP + "alpaka::atomicMul(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "max") { + op += SP + SP + SP + SP + "alpaka::atomicMax(acc, &Y[out_idx], U[elem_idx]);\n"; + } else if (fReduction == "min") { + op += SP + SP + SP + SP + "alpaka::atomicMin(acc, &Y[out_idx], U[elem_idx]);\n"; + } + + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + + return op; + } + +std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + return SP + "ScatterElementsKernel_" + opName + " scatterElementsKernel_" + opName + ";\n"; +} + +std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeY.empty()) { + throw std::runtime_error("TMVA SOFIE ScatterElements Op called to Generate without being initialized first"); + } + + std::size_t totalElements = ConvertShapeToLength(fShapeI); + + std::stringstream out; + out << "\n//------ SCATTERELEMENTS_GPU_ALPAKA\n"; + + out << SP << "alpaka::memcpy(queue, deviceBuf_" << fNY << ", deviceBuf_" << fNX << ");\n"; + out << SP << "alpaka::wait(queue);\n\n"; + + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP 
<< "alpaka::KernelCfg const kernelCfg_" << opName << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName << ", devAcc, scatterElementsKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNI << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNU << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", scatterElementsKernel_" << opName + << ", alpaka::getPtrNative(deviceBuf_" << fNY << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNI << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNU << ")" + << ", static_cast(" << totalElements << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); +} }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx index 52bdeae..34e69eb 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Shape.hxx @@ -101,6 +101,26 @@ public: return out.str(); } + std::string Generate_GPU_ALPAKA(std::string OpName) override { + // no need to generate code if the output is constant + if (fIsOutputConstant) return ""; + + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Shape op called to Generate without being initialized first"); + } + std::stringstream out; + + out << "\n//------ Shape\n"; + // add a dummy statement to avoid warning for unused input + out << SP << "(void) deviceBuf_" << fNX << ";\n"; + size_t length = ConvertShapeToLength(fOutput_shape); + for (size_t id = 0; id < length; id++) { + out << SP << "deviceBuf_" << fNY << "["<< id << "] = " << fShape[fStart+id] << ";\n"; + } + return out.str(); + } + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx 
index 68edd01..77f989c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Sigmoid.hxx @@ -23,6 +23,7 @@ public: ROperator_Sigmoid(){} ROperator_Sigmoid(std::string nameX, std::string nameY): fNX(UTILITY::Clean_name(nameX)), fNY(UTILITY::Clean_name(nameY)){ + fKind = OperatorKind::SIGMOID; fInputTensorNames = { fNX }; fOutputTensorNames = { fNY }; } @@ -61,6 +62,60 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string /*opName*/) override { + std::string op; + op = "\n//------ SIGMOID_KERNEL_ALPAKA\n"; + op += "struct SigmoidKernel {\n"; + op += SP + "template\n"; + op += SP + "ALPAKA_FN_ACC void operator()(TAcc const & acc, T const* __restrict__ data, T* __restrict__ out, std::size_t numElements) const {\n"; + op += SP + SP + "const auto idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + "if(idx < numElements) {\n"; + op += SP + SP + SP + SP + "out[idx] = static_cast(1) / (static_cast(1) + exp(-data[idx]));\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string /*opName*/) override { + return SP + "SigmoidKernel sigmoidKernel;\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + OpName = "op_" + OpName; + if (fShape.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Sigmoid called to Generate without being initialized first"); + } + + std::stringstream out; + auto length = ConvertShapeToLength(fShape); + out << "\n//------ SIGMOID_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNX << " = {elementsPerGrid_" << fNX << ", elementsPerThread_" << fNX << "};\n"; + out << SP << "auto const workDiv_" << fNX << " = alpaka::getValidWorkDiv(kernelCfg_" << fNX << ", devAcc, sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), 
alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNX + << ", sigmoidKernel, alpaka::getPtrNative(deviceBuf_" << fNX + << "), alpaka::getPtrNative(deviceBuf_" << fNY << "), static_cast(" << length << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } + + std::string GetFusableOutputTensorName() override { + return fNY; + } + + void UpdateFusableTensorName(std::string fusable_tensor_name, const std::function& removal_func){ + removal_func(fNX); + removal_func(fNY); + fNX = fusable_tensor_name; + fNY = fusable_tensor_name; + fInputTensorNames[0] = fNX; + fOutputTensorNames[0] = fNY; + } + std::vector GetStdLibs() override { return { std::string("cmath") };} }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx index 63fbcb3..c9af13e 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Split.hxx @@ -153,6 +153,111 @@ public: return out.str(); } +std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fOutputShapes.empty()) + throw std::runtime_error("TMVA SOFIE Operator Split called to Generate without being initialized first"); + + const std::size_t D = fInputShape.size(); + const std::size_t Nin = fNYs.size(); + + auto inputStrides = UTILITY::ComputeStrideFromShape(fInputShape); + + std::string op; + op = "\n//------ SPLIT_KERNEL_ALPAKA\n"; + std::cout<<"Generating GPU kernel for Split operator with input shape "<< ConvertShapeToString(fInputShape) << " and output shapes : "; + for (std::size_t i = 0; i < Nin; ++i) { + std::cout<<"Loop running for output "<\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* input,\n"; + op += SP + SP + SP + "T* output,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += 
SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fOutputShapes[i][d]) + "u;\n"; + } + op += "\n"; + + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + std::string coord = (d == static_cast(fAxis)) + ? ("(out_" + std::to_string(d) + " + " + std::to_string(axis_offset) + "u)") + : ("out_" + std::to_string(d)); + op += SP + SP + SP + SP + SP + coord + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; + } + + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n\n"; + } + std::cout<<"Finished generating GPU kernel for Split operator "<(1));\n"; + out << SP << SP << "auto const elementsPerGrid_" << i << " = Vec::all(Idx{" << length << "});\n"; + out << SP << SP << "alpaka::KernelCfg const kernelCfg_" << i + << " = {elementsPerGrid_" << i << ", elementsPerThread_" << i << "};\n"; + out << SP << SP << "auto const workDiv_" << i << " = alpaka::getValidWorkDiv(kernelCfg_" << i + << ", devAcc, " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNYs[i] << ")" + << ", static_cast(" << length << "));\n"; + out << SP << SP << "alpaka::exec(queue, workDiv_" << i + << ", " << kname + << ", alpaka::getPtrNative(deviceBuf_" << fNX << ")" + << ", alpaka::getPtrNative(deviceBuf_" << fNYs[i] << ")" + << ", static_cast(" << length << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + out << SP << "}\n"; + } + return out.str(); +} + }; }//SOFIE diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx index 354fbe3..608308c 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Tile.hxx @@ -19,14 +19,17 @@ private: std::string fNRepeats; std::string fNInput; std::string fNY; - std::vectorfShapeInput; + std::vector fShapeInput; std::vector fShapeY; + std::vector fRepeats; public: ROperator_Tile(){} ROperator_Tile(std::string nameRepeat, std::string nameInput, std::string nameY): - fNRepeats(UTILITY::Clean_name(nameRepeat)),fNInput(UTILITY::Clean_name(nameInput)), fNY(UTILITY::Clean_name(nameY)){ - fInputTensorNames = { fNRepeats, fNInput }; + fNRepeats(UTILITY::Clean_name(nameRepeat)), + fNInput(UTILITY::Clean_name(nameInput)), + fNY(UTILITY::Clean_name(nameY)) { + fInputTensorNames = { fNRepeats, fNInput }; fOutputTensorNames = { 
fNY }; } @@ -36,114 +39,214 @@ public: std::vector> ShapeInference(std::vector> input) override { std::vector ret = input[0]; - - for(size_t i=0; i < input[1].size(); i++) { - ret[i]=ret[i]*input[1][i]; - } + for (size_t i = 0; i < input[1].size(); i++) + ret[i] = ret[i] * input[1][i]; return {ret}; } void Initialize(RModel& model) override { - //input must be a graph input, or already initialized intermediate tensor - if (model.CheckIfTensorAlreadyExist(fNInput) == false){ - throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); - } - if (model.CheckIfTensorAlreadyExist(fNRepeats) == false){ - throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); - } - fShapeInput=model.GetTensorShape(fNInput); + if (model.CheckIfTensorAlreadyExist(fNInput) == false) + throw std::runtime_error("TMVA SOFIE Tile Op Input Tensor is not found in model"); + if (model.CheckIfTensorAlreadyExist(fNRepeats) == false) + throw std::runtime_error("TMVA SOFIE Tile Op Repeats Tensor is not found in model"); - // if repeats vector is not initialized we cannot deduce shape of output - // not support for time being this case - if (!model.IsInitializedTensor(fNRepeats)) { + fShapeInput = model.GetTensorShape(fNInput); + + if (!model.IsInitializedTensor(fNRepeats)) throw std::runtime_error("TMVA SOFIE Tile Op: non-initialized repeats input is not supported"); - } - // Retrieve the data pointer for the repeats tensor - auto repptr = model.GetInitializedTensorData(fNRepeats); - // Cast the raw pointer to the appropriate type (size_t*) + auto repptr = model.GetInitializedTensorData(fNRepeats); auto repeats_data = static_cast(repptr.get()); - if (repeats_data == nullptr) { - throw std::runtime_error("Failed to retrieve the data for the repeats tensor."); - } - // Get the shape of the repeats tensor to determine the number of elements + if (repeats_data == nullptr) + throw std::runtime_error("TMVA SOFIE Tile Op: failed to retrieve repeats tensor 
data"); + auto repeats_shape = model.GetTensorShape(fNRepeats); - // Ensure the repeats tensor is 1D and get the number of elements - if (repeats_shape.size() != 1) { - throw std::runtime_error("Repeats tensor is not 1D."); - } - size_t num_elements = repeats_shape[0]; - // Convert the data to a vector of size_t - std::vector repeats_vector(num_elements); - std::copy(repeats_data, repeats_data + num_elements, repeats_vector.begin()); + if (repeats_shape.size() != 1) + throw std::runtime_error("TMVA SOFIE Tile Op: repeats tensor must be 1D"); + size_t num_elements = repeats_shape[0]; - fShapeY = ShapeInference({fShapeInput,repeats_vector})[0]; + // Save repeats if known at generation time so the GPU kernel can bake + // fShapeInput[d] directly without needing a runtime repeats pointer. + // fRepeats is left empty if repeats are not initialized (future case), + // which will cause the kernel to use the runtime repeats pointer path. + fRepeats.resize(num_elements); + std::copy(repeats_data, repeats_data + num_elements, fRepeats.begin()); + if (fRepeats.size()){ + model.RemoveInitializedTensor(fNRepeats); + } + fShapeY = ShapeInference({fShapeInput, fRepeats})[0]; model.AddIntermediateTensor(fNY, model.GetTensorType(fNInput), fShapeY); if (model.Verbose()) - std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) - << " given repeats " << ConvertShapeToString(repeats_vector) << std::endl; + std::cout << "Tile: " << fNInput << " " << ConvertShapeToString(fShapeInput) + << " -> " << fNY << " with shape " << ConvertShapeToString(fShapeY) + << " given repeats " << ConvertShapeToString(fRepeats) << std::endl; } std::string Generate(std::string OpName) override { OpName = "op_" + OpName; - if (fShapeInput.empty() || fShapeY.empty()) { - throw std::runtime_error("TMVA SOFIE Tile Op called to Generate without being initialized first"); + if (fShapeInput.empty() || fShapeY.empty()) + throw 
std::runtime_error("TMVA SOFIE Tile Op called to Generate without being initialized first"); + + std::stringstream out; + std::string input = "tensor_" + fNInput; + std::string output = "tensor_" + fNY; + std::string repeats = "tensor_" + fNRepeats; + + out << "///-------- Tile operator\n"; + out << "{\n"; + + out << SP << "const int input_shape[" << fShapeInput.size() << "] = {"; + for (size_t i = 0; i < fShapeInput.size(); ++i) { + if (i > 0) out << ", "; + out << fShapeInput[i]; + } + out << "};\n"; + + out << SP << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; + out << SP << "int s = 1;\n"; + + // Read repeats from the tensor at runtime so the generated code remains + // correct even if repeats become a runtime input/intermediate in the future + out << SP << "for (int i = " << fShapeInput.size() - 1 << "; i >= 0; i--) {\n"; + out << SP << SP << "int r = " << repeats << "[i];\n"; + out << SP << SP << "int i_offset = 0, o_offset = 0;\n"; + out << SP << SP << "s = s * input_shape[i];\n"; + out << SP << SP << "if (i == " << fShapeInput.size() - 1 << ") {\n"; + out << SP << SP << SP << "for (int j = 0; j < inputLength / s; j++) {\n"; + out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n"; + out << SP << SP << SP << SP << SP << "std::copy(" << input << " + i_offset, " + << input << " + i_offset + s, " + << output << " + o_offset);\n"; + out << SP << SP << SP << SP << SP << "o_offset += s;\n"; + out << SP << SP << SP << SP << "}\n"; + out << SP << SP << SP << SP << "i_offset += s;\n"; + out << SP << SP << SP << "}\n"; + out << SP << SP << "} else {\n"; + out << SP << SP << SP << "for (int j = inputLength / s - 1; j >= 0; j--) {\n"; + out << SP << SP << SP << SP << "o_offset = j * s * r;\n"; + out << SP << SP << SP << SP << "i_offset = j * s;\n"; + out << SP << SP << SP << SP << "for (int k = 0; k < r; k++) {\n"; + out << SP << SP << SP << SP << SP << "std::copy(" << output << " + i_offset, " + << output << " + i_offset + s, " + << 
output << " + o_offset);\n"; + out << SP << SP << SP << SP << SP << "o_offset += s;\n"; + out << SP << SP << SP << SP << "}\n"; + out << SP << SP << SP << "}\n"; + out << SP << SP << "}\n"; + out << SP << SP << "s *= r;\n"; + out << SP << SP << "inputLength *= r;\n"; + out << SP << "}\n"; + out << "}\n"; + return out.str(); + } + + std::string Generate_GPU_Kernel_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE Operator Tile called to Generate without being initialized first"); + + const std::size_t D = fShapeInput.size(); + + auto inputStrides = UTILITY::ComputeStrideFromShape(fShapeInput); + auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeY); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + + // If fRepeats is populated, repeats were known at generation time and + // we can bake fShapeInput[d] as literals — no runtime repeats pointer needed. + // If fRepeats is empty (future: runtime repeats), pass repeats as a kernel arg. 
+ bool repeatsKnown = !fRepeats.empty(); + + std::string kname = "TileKernel_" + opName; + + std::string op; + op = "\n//------ TILE_KERNEL_ALPAKA\n"; + op += SP + "struct " + kname + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(\n"; + op += SP + SP + SP + "TAcc const& acc,\n"; + op += SP + SP + SP + "T const* __restrict__ input,\n"; + op += SP + SP + SP + "T* __restrict__ output,\n"; + if (!repeatsKnown) + op += SP + SP + SP + "int64_t const* __restrict__ repeats,\n"; + op += SP + SP + SP + "std::size_t const totalElements) const {\n\n"; + + op += SP + SP + SP + "auto const global_thread_idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + "if (global_thread_idx >= totalElements) return;\n"; + op += SP + SP + SP + "auto const grid_thread_extent = alpaka::getWorkDiv(acc)[0];\n\n"; + + op += SP + SP + SP + "for (std::size_t elem_idx = global_thread_idx; elem_idx < totalElements; elem_idx += grid_thread_extent) {\n\n"; + + // Decompose output linear index — output strides always compile-time + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + "std::size_t const out_" + std::to_string(d) + + " = (elem_idx / " + std::to_string(outputStrides[d]) + "u) % " + + std::to_string(fShapeY[d]) + "u;\n"; + } + op += "\n"; + + // Input index: fShapeInput[d] is always a compile-time constant since + // it is the input tensor shape, never runtime-variable. + // When repeatsKnown, we bake it directly as a literal. + // When not repeatsKnown (future), we still use fShapeInput[d] as a + // literal for the % — repeats pointer is only needed if fShapeY is dynamic. + op += SP + SP + SP + SP + "std::size_t const input_idx =\n"; + for (std::size_t d = 0; d < D; ++d) { + op += SP + SP + SP + SP + SP + + "(out_" + std::to_string(d) + " % " + std::to_string(fShapeInput[d]) + "u)" + + " * " + std::to_string(inputStrides[d]) + "u"; + op += (d + 1 < D) ? 
" +\n" : ";\n\n"; } - //size_t input_length = ConvertShapeToLength(fShapeInput); - //size_t output_length = ConvertShapeToLength(fShapeY); + op += SP + SP + SP + SP + "output[elem_idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + "}\n"; + op += SP + "};\n"; + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string opName) override { + opName = "op_" + opName; + std::string kname = "TileKernel_" + opName; + return SP + kname + " tileKernel_" + opName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string opName) override { + opName = "op_" + opName; + if (fShapeInput.empty() || fShapeY.empty()) + throw std::runtime_error("TMVA SOFIE Operator Tile called to Generate without being initialized first"); + + bool repeatsKnown = !fRepeats.empty(); + std::size_t totalElements = ConvertShapeToLength(fShapeY); + std::string kname = "tileKernel_" + opName; + + // Build argument list once, reused for both getValidWorkDiv and exec + std::string args = + "alpaka::getPtrNative(deviceBuf_" + fNInput + "), " + + "alpaka::getPtrNative(deviceBuf_" + fNY + ")"; + if (!repeatsKnown) + args += ", alpaka::getPtrNative(deviceBuf_" + fNRepeats + ")"; + args += ", static_cast(" + std::to_string(totalElements) + ")"; std::stringstream out; - std::string input = "tensor_" + fNInput; - std::string output = "tensor_" + fNY; - out << "///-------- Tile operator\n"; - out << "{\n"; // add scope to re-use same names - out << "const int input_shape[" << fShapeInput.size() << "] = " << ConvertShapeToString(fShapeInput) << ";\n"; - - out << "int inputLength = " << ConvertShapeToLength(fShapeInput) << ";\n"; - out << "int s = 1;\n"; - // loop from inverse dim order - out << "for (int i = " << fShapeInput.size()-1 << "; i >=0; i--) {\n"; - out << SP << "int r = tensor_" << fNRepeats << "[i];\n"; - // we cannot exclude case where repeats=1 since we need offset - //out << SP << "if (r == 1 && i < " << fShapeInput.size()-1 << ") continue;\n"; - out 
<< SP << "int i_offset = 0, o_offset = 0;\n"; - out << SP << "s = s * input_shape[i];\n"; - // case we have first copy - out << SP << "if (i == " << fShapeInput.size()-1 << ") {\n"; - out << SP << SP << "for (int j = 0; j < inputLength/s ; j++) {\n"; - out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n"; - out << SP << SP << SP << SP << "std::copy(" << input << "+ i_offset, " - << input << "+ i_offset + s, " << output << "+ o_offset);\n"; - out << SP << SP << SP << SP << "o_offset += s;\n"; - out << SP << SP << SP << "}\n"; // end k loop - out << SP << SP << SP << "i_offset += s;\n"; - out << SP << SP << "}\n"; // end j loop - out << SP << "} else {\n"; // second copy we do from output to output - // and we need to loop on j from reverse order to avoir re-writing in output tensor - out << SP << SP << "for (int j = inputLength/s - 1 ; j>=0; j--) {\n"; - out << SP << SP << SP << "o_offset = j*s*r;\n"; - out << SP << SP << SP << "i_offset = j*s;\n"; - out << SP << SP << SP << "for (int k = 0; k < r ; k++) {\n"; - out << SP << SP << SP << SP << "std::copy(" << output << "+ i_offset, " - << output << "+ i_offset + s, " << output << "+ o_offset);\n"; - out << SP << SP << SP << SP << "o_offset += s;\n"; - out << SP << SP << SP << "}\n"; // end k loop - out << SP << SP << "}\n"; // end j loop - out << SP << "}\n"; // end if - out << SP << "s *= r;\n"; - out << SP << "inputLength *= r;\n"; - out << "}\n"; // end i loop - out << "}\n"; // end of scope + out << "\n//------ TILE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_" << opName << " = Vec::all(static_cast(1));\n"; + out << SP << "auto const elementsPerGrid_" << opName << " = Vec::all(Idx{" << totalElements << "});\n"; + out << SP << "alpaka::KernelCfg const kernelCfg_" << opName + << " = {elementsPerGrid_" << opName << ", elementsPerThread_" << opName << "};\n"; + out << SP << "auto const workDiv_" << opName << " = alpaka::getValidWorkDiv(kernelCfg_" << opName + << ", devAcc, " << kname << ", " << 
args << ");\n"; + out << SP << "alpaka::exec(queue, workDiv_" << opName + << ", " << kname << ", " << args << ");\n"; + out << SP <<"alpaka::wait(queue);\n"; return out.str(); } + }; }//SOFIE - #endif //SOFIE_ROPERATOR_Tile diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx index 11c40bb..de33544 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Transpose.hxx @@ -165,6 +165,62 @@ public: return out.str(); } + std::string Generate_GPU_Kernel_ALPAKA(std::string OpName) { + std::string op; + OpName = "op_" + OpName; + op = "\n//------ TRANSPOSE_KERNEL_ALPAKA\n"; + op += SP + "struct TransposeKernel_" + OpName + " {\n"; + op += SP + SP + "template\n"; + op += SP + SP + "ALPAKA_FN_ACC void operator()(TAcc const& acc, T const* input, T* output,"; + op += "const std::size_t totalElements) const {\n"; + op += SP + SP + SP + SP + "auto const idx = alpaka::getIdx(acc)[0];\n"; + op += SP + SP + SP + SP + "if(idx >= totalElements) return;\n"; + op += SP + SP + SP + SP + "std::size_t input_idx = 0;\n"; + op += SP + SP + SP + SP + "std::size_t remaining = idx;\n"; + op += SP + SP + SP + SP + "std::size_t coord;\n"; + + auto inputStrides = UTILITY::ComputeStrideFromShape(fShapeData); + auto outputStrides = UTILITY::ComputeStrideFromShape(fShapeOutput); + + for (size_t k = 0; k < fShapeData.size(); k++) { + op += SP + SP + SP + SP + "coord = remaining / " + + std::to_string(outputStrides[k]) + "u;\n"; + op += SP + SP + SP + SP + "remaining = remaining - coord * " + + std::to_string(outputStrides[k]) + "u;\n"; + op += SP + SP + SP + SP + "input_idx += coord * " + + std::to_string(inputStrides[fAttrPerm[k]]) + "u;\n"; + } + + op += SP + SP + SP + SP + "output[idx] = input[input_idx];\n"; + op += SP + SP + SP + "}\n"; + op += SP + SP + SP + "};\n"; + + return op; + } + + std::string Generate_GPU_Kernel_Definitions_ALPAKA(std::string OpName) override { + return SP + 
"TransposeKernel_op_" + OpName + " transposeKernel_" + OpName + ";\n"; + } + + std::string Generate_GPU_ALPAKA(std::string OpName) override { + if (fShapeOutput.empty()) { + throw std::runtime_error("TMVA SOFIE Operator Transpose called to Generate without being initialized first"); + } + std::stringstream out; + auto length = ConvertShapeToLength(fShapeOutput); + + out << "\n//------ TRANSPOSE_GPU_ALPAKA\n"; + out << SP << "auto const elementsPerThread_"<(1));\n"; + out << SP << "auto const elementsPerGrid_"< const kernelCfg_" << fNOutput << " = {elementsPerGrid_" << fNOutput << ", elementsPerThread_" << fNOutput << "};\n"; + out << SP << "auto const workDiv_" << fNOutput << " = alpaka::getValidWorkDiv(kernelCfg_" << fNOutput << ", devAcc, transposeKernel_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fNData + << "), alpaka::getPtrNative(deviceBuf_" << fNOutput << "), static_cast(" << length << "));\n"; + out << SP << "alpaka::exec(queue, workDiv_" << fNOutput + << ", transposeKernel_" << OpName << ", alpaka::getPtrNative(deviceBuf_" << fNData + << "), alpaka::getPtrNative(deviceBuf_" << fNOutput << "), static_cast(" << length << "));\n"; + out << SP <<"alpaka::wait(queue);\n"; + return out.str(); + } }; diff --git a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx index 28ac093..2a55700 100644 --- a/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx +++ b/src/SOFIE_core/inc/SOFIE/ROperator_Where.hxx @@ -7,11 +7,8 @@ #include - namespace SOFIE{ - - template class ROperator_Where final : public ROperator{ private: @@ -104,7 +101,7 @@ public: if (model.IsInitializedTensor(fNA)) { auto data = model.GetInitializedTensorData(fNA); std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeA, fShapeY), std::default_delete()); // Update the data and the shape of A model.AddConstantTensor(fNBroadcastedA, 
model.GetTensorType(fNA), fShapeY, broadcastedData); @@ -120,7 +117,7 @@ public: if (model.IsInitializedTensor(fNB)) { auto data = model.GetInitializedTensorData(fNB); std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeB, fShapeY), std::default_delete()); // do not update tensor B but add broadcasted one (since it can be input to some other operators) model.AddConstantTensor(fNBroadcastedB, model.GetTensorType(fNB), fShapeY, broadcastedData); @@ -136,7 +133,7 @@ public: if (model.IsInitializedTensor(fNC)) { auto data = model.GetInitializedTensorData(fNC); std::shared_ptr broadcastedData( - UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeC, fShapeY), + UTILITY::UnidirectionalBroadcast(static_cast(data.get()), fShapeC, fShapeY), std::default_delete()); // do not update tensor C but add broadcasted one (since it can be input to some other operators) model.AddConstantTensor(fNBroadcastedC, model.GetTensorType(fNC), fShapeY, broadcastedData); @@ -150,32 +147,86 @@ public: fShapeY = fShapeA; } // check case of constant output (if all inputs are defined) - if (model.IsInitializedTensor(fNA) && model.IsInitializedTensor(fNB) && model.IsInitializedTensor(fNC)) { - std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; - std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; + if (model.IsInitializedTensor(fNC)) { + std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; - auto dataA = static_cast(model.GetInitializedTensorData(nameA).get()); - auto dataB = static_cast(model.GetInitializedTensorData(nameB).get()); auto dataC = static_cast(model.GetInitializedTensorData(nameC).get()); - std::vector dataY(ConvertShapeToLength(fShapeY)); - for (size_t i = 0; i < dataY.size(); i++) - dataY[i] = (dataC[i]) ? 
dataA[i] : dataB[i]; - model.AddConstantTensor(fNY, fShapeY, dataY.data()); - // flag tensors to not be written in a file - model.SetNotWritableInitializedTensor(nameA); - model.SetNotWritableInitializedTensor(nameB); model.SetNotWritableInitializedTensor(nameC); + T * dataA = nullptr; + T * dataB = nullptr; + std::vector shapeDataA; + std::vector shapeDataB; + if (model.IsInitializedTensor(fNA)) { + std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; + dataA = static_cast(model.GetInitializedTensorData(nameA).get()); + // flag tensors to not be written in a file + model.SetNotWritableInitializedTensor(nameA); + } else if (model.IsShapeTensor(fNA)) + shapeDataA = model.GetShapeTensorValues(fNA); + if (model.IsInitializedTensor(fNB)) { + std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; + dataB = static_cast(model.GetInitializedTensorData(nameB).get()); + model.SetNotWritableInitializedTensor(nameB); + } else if (model.IsShapeTensor(fNB)) + shapeDataB = model.GetShapeTensorValues(fNB); - fIsOutputConstant = true; - if (model.Verbose()) + std::vector dataY; + std::vector shapeDataY; + + bool isOutputConstantTensor = true; + if (dataA && dataB) { + dataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < dataY.size(); i++) + dataY[i] = (dataC[i]) ? dataA[i] : dataB[i]; + } + else if (dataA && shapeDataB.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? Dim{size_t(dataA[i])} : shapeDataB[i]; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + else if (dataB && shapeDataA.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? 
shapeDataB[i] : Dim{size_t(dataB[i])}; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + else if (shapeDataB.size() > 0 && shapeDataA.size()>0 ) { + shapeDataY.resize(ConvertShapeToLength(fShapeY)); + for (size_t i = 0; i < shapeDataY.size(); i++) { + shapeDataY[i] = (dataC[i]) ? shapeDataA[i] : shapeDataB[i]; + isOutputConstantTensor &= !shapeDataY[i].isParam; + } + } + fIsOutputConstant = true; // this contains both case constant tensor output ans shape tensor output + if (isOutputConstantTensor && dataY.empty()) { + dataY.resize(shapeDataY.size()); + for (size_t i = 0; i < shapeDataY.size(); i++) + dataY[i] = static_cast(shapeDataY[i].dim); + } + if (dataY.size() > 0) + model.AddConstantTensor(fNY, fShapeY, dataY.data()); + else if (shapeDataY.size() > 0 ) + model.AddShapeTensor(fNY, shapeDataY, fShapeY.size() == 0); + else { + fIsOutputConstant = false; + } + if (fIsOutputConstant && model.Verbose()) std::cout << "Where op ---> " << fNY << " " << ConvertShapeToString(fShapeY) << " : " - << ConvertValuesToString(dataY) << std::endl; - + << ((dataY.size() > 0) ? ConvertValuesToString(dataY) : ConvertDimShapeToString(shapeDataY) ) + << ((dataY.size() > 0) ? 
" (constant)" : " (shape)") << std::endl; + // output is a constant tensor - fOutputTensorNames.pop_back(); + if (fIsOutputConstant) fOutputTensorNames.pop_back(); } - else { + if (!fIsOutputConstant) { model.AddIntermediateTensor(fNY, model.GetTensorType(fNA), fShapeY); + if (model.Verbose()) + std::cout << "Where op " << " condition : " << fNC << " " << ConvertShapeToString(fShapeC) << + " X " << fNA << " " << ConvertShapeToString(fShapeA) << " Y " << fNB << " " << ConvertShapeToString(fShapeB) + << " ---> " << fNY << " " << ConvertShapeToString(fShapeY) << std::endl; } } @@ -184,52 +235,51 @@ public: return out.str(); } - std::string Generate(std::string OpName) override { + std::string Generate(std::string opName) override { if (fIsOutputConstant) return ""; - OpName = "op_" + OpName; + opName = "op_" + opName; if (fShapeY.empty()) { throw std::runtime_error("TMVA SOFIE Where Op called to Generate without being initialized first"); } std::stringstream out; - out << SP << "\n//-------- Where \n"; + out << SP << "\n//-------- Where " << opName << " --> " << ConvertShapeToString(fShapeY) << "\n"; size_t length = ConvertShapeToLength(fShapeY); std::string typeName = TensorType::Name(); // Broadcast A if it's uninitialized if (fShapeA != fShapeY) { out << SP << "// Broadcasting uninitialized tensor " << fNA << "\n"; //out << SP << "{\n"; - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedA << ");\n"; + out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNA << ", " << ConvertShapeToString(fShapeA) << ", " << ConvertShapeToString(fShapeY) + << ", tensor_" << fNBroadcastedA << ");\n"; } // Broadcast B if it's uninitialized if (fShapeB != fShapeY) { out << SP << "// Broadcasting uninitialized tensor " << fNB << "\n"; //out << SP << "{\n"; - out << SP << 
"SOFIE::UTILITY::UnidirectionalBroadcast<" << typeName << ">(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedB << ");\n"; + out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tensor_" << fNB << ", " << ConvertShapeToString(fShapeB) << ", " << ConvertShapeToString(fShapeY) + << ", tensor_" << fNBroadcastedB << ");\n"; } // Broadcast C if it's uninitialized if (fShapeC != fShapeY) { // special case if C is an input tensor if (fIsInputBoolTensor) { size_t inputLength = ConvertShapeToLength(fShapeC); - out << SP << "std::vector fTensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; + out << SP << "std::vector tmp_tensor_" << fNC << "(tensor_" << fNC << ", tensor_" << fNC << " + " << inputLength << ");\n"; } out << SP << "// Broadcasting uninitialized tensor " << fNC << "\n"; //out << SP << "{\n"; - // for boolean we need to pass vector and use the non-template version of the function - out << SP << "SOFIE::UTILITY::UnidirectionalBroadcast(fTensor_" << fNC << ", " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) - << ", fTensor_" << fNBroadcastedC << ");\n"; + out << SP << "TMVA::Experimental::SOFIE::UTILITY::UnidirectionalBroadcast(tmp_tensor_" << fNC << ".data(), " << ConvertShapeToString(fShapeC) << ", " << ConvertShapeToString(fShapeY) + << ", tensor_" << fNBroadcastedC << ");\n"; } std::string nameA = fNBroadcastedA.empty()? fNA : fNBroadcastedA; std::string nameB = fNBroadcastedB.empty()? fNB : fNBroadcastedB; std::string nameC = fNBroadcastedC.empty()? fNC : fNBroadcastedC; out << SP << "for (size_t id = 0; id < " << length << " ; id++){\n"; - // get output tensor applying condition (note we need to use directly the vector since v.data(), i.e the data pointer, does not exist) - out << SP << SP << "tensor_" << fNY << "[id] = " << "(fTensor_" << nameC << "[id]) ? 
tensor_" + // get output tensor applying condition + out << SP << SP << "tensor_" << fNY << "[id] = " << "tensor_" << nameC << "[id] ? tensor_" << nameA << "[id] : tensor_" + nameB + "[id];\n"; out << SP << "}\n"; return out.str(); @@ -239,5 +289,4 @@ public: }//SOFIE - -#endif //SOFIE_ROperator_Where +#endif //TMVA_SOFIE_ROperator_Where diff --git a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx index d183052..d59eee8 100644 --- a/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx +++ b/src/SOFIE_core/inc/SOFIE/SOFIE_common.hxx @@ -1,7 +1,7 @@ #ifndef SOFIE_SOFIE_COMMON #define SOFIE_SOFIE_COMMON -#include "TMVA/RTensor.hxx" +#include "SOFIE/RTensor.hxx" #include "ROOT/RSpan.hxx" @@ -21,13 +21,10 @@ #include #include - -namespace SOFIE{ - -//typedef RTensor tensor_t; +namespace SOFIE { enum class ETensorType{ - UNDEFINED = 0, FLOAT = 1, UNINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive + UNDEFINED = 0, FLOAT = 1, UINT8 = 2, INT8 = 3, UINT16 = 4, INT16 = 5, INT32 = 6, INT64 = 7, STRING = 8, BOOL = 9, //order sensitive FLOAT16 = 10, DOUBLE = 11, UINT32 = 12, UINT64 = 13, COMPLEX64 = 14, COMPLEX28 = 15, BFLOAT16 = 16 }; @@ -39,7 +36,7 @@ constexpr size_t GetTypeSize(ETensorType type) { switch (type) { case ETensorType::FLOAT: return sizeof(float); case ETensorType::DOUBLE: return sizeof(double); - case ETensorType::UNINT8: return sizeof(uint8_t); + case ETensorType::UINT8: return sizeof(uint8_t); case ETensorType::INT8: return sizeof(int8_t); case ETensorType::UINT16: return sizeof(uint16_t); case ETensorType::INT16: return sizeof(int16_t); @@ -58,6 +55,9 @@ typedef std::int64_t int_t; std::string ConvertTypeToString(ETensorType type); ETensorType ConvertStringToType(std::string type); +// find if a string represents a number +bool IsInteger(const std::string & s); + struct Dim{ bool isParam = false; size_t dim = 0; @@ -67,16 +67,42 @@ struct Dim{ Dim() {} // constructor for a 
parametric dimension with the option to pass a default dim value - Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) {} + // We use -1 for dim to indicate that the param dimension is an expression (e.g. "d1+d2") + // in case the string represents a number make Dim not parametric + Dim(const std::string & p, size_t d = 0) : isParam(true), dim(d), param(p) + { + if (IsInteger(p)) { + isParam = false; + dim = std::stoi(p); + } + } // constructor for a non-parametric dimension Dim(size_t d) : dim(d) {} std::string GetVal() const { - return (isParam) ? param : std::to_string(dim); + // cast to int64_t for negative shape values + return (isParam) ? param : std::to_string(static_cast(dim)); + } + + std::ostream& operator<< (std::ostream& os) const { + os << GetVal(); + return os; + } + + bool operator==(const Dim& rhs) const { + return (isParam && rhs.isParam) ? param == rhs.param : dim == rhs.dim; + } + bool operator!=(const Dim& rhs) const { + return !(*this == rhs); } }; +//bool operator==(const Dim& lhs, const Dim& rhs); +inline std::ostream & operator<< (std::ostream &os, const Dim &d) { + os << d.GetVal(); + return os; +} struct InputTensorInfo{ ETensorType type; @@ -93,6 +119,18 @@ struct DynamicTensorInfo{ std::vector shape; }; +// template traits for Tensor Shape +template +struct TensorShape {}; +template<> +struct TensorShape { + static bool IsDim() { return true; } +}; +template<> +struct TensorShape { + static bool IsDim() { return false; } +}; + // template traits for Tensor type template struct TensorType {}; @@ -120,6 +158,18 @@ template<> struct TensorType { static const std::string Name() { return "uint64_t"; } }; +template<> +struct TensorType { + static const std::string Name() { return "bool"; } +}; +template<> +struct TensorType { + static const std::string Name() { return "int8_t"; } +}; +template<> +struct TensorType { + static const std::string Name() { return "uint8_t"; } +}; struct TensorMemoryInfo { std::string_view 
tensor_name; @@ -148,19 +198,17 @@ struct MemoryPoolInfo { std::map available_stack; }; -std::vector ConvertShapeToDim(std::vector shape); +std::vector ConvertShapeToDim(const std::vector & shape); + +std::vector ConvertShapeToInt(const std::vector & shape); -std::vector ConvertShapeToInt(std::vector shape); +std::size_t ConvertShapeToLength(const std::vector & shape); -std::size_t ConvertShapeToLength(std::vector shape); +std::string ConvertShapeToString(const std::vector & shape); +std::string ConvertDimShapeToString(const std::vector & shape); -std::string ConvertShapeToString(std::vector shape); -std::string ConvertDynamicShapeToString(std::vector shape); -// std::string ConvertShapeToString(std::vector shape) { -// return ConvertDynamicShapeToString(shape); -// } +std::string ConvertDimShapeToLength(const std::vector & shape); -std::string ConvertDynamicShapeToLength(std::vector shape); template std::string ConvertValToString(T value) { @@ -179,8 +227,11 @@ std::string ConvertValuesToString(size_t n, const T * data) { ret << "{ "; for (size_t i = 0; i < n; i++) { if (std::is_floating_point_v) - ret << std::setprecision(std::numeric_limits::max_digits10); - ret << data[i]; + ret << std::setprecision(std::numeric_limits::max_digits10) << data[i]; + else + // cast in case of boolean (int8) + ret << (int64_t) data[i]; + if (i < n-1) ret << ", "; } ret << "}"; @@ -206,8 +257,14 @@ public: bool IsConstantTensor() const { return fConstant;} // query if tensor needs to be written in a weight file. Constant tensors are not written in a file bool IsWeightTensor() const { return !fConstant && !fIsNotWritable;} + // check if a Tensor is Writable (need to be written in the file or in the generated code (e.g. 
as a constant tensor) + // if an initialized tensors is used in a constant operator at compile time does not need to be written and can be omitted in + // the generated code + bool IsNotWritable() const { return fIsNotWritable; } // set not writable initialized tensors - i.e. tensor that must not be written in a file void SetNotWritable() { fIsNotWritable = true;} + // set as constant (needed for non-float initialized tensors) + void SetConstant() { fConstant = true;} template T const *data() const @@ -230,7 +287,7 @@ public: case ETensorType::INT64: fSize *= sizeof(int64_t); break; case ETensorType::BOOL: fSize *= sizeof(bool); break; default: - throw std::runtime_error("TMVA::SOFIE doesn't yet supports serialising data-type " + + throw std::runtime_error("SOFIE doesn't yet supports serialising data-type " + ConvertTypeToString(fType)); } fPersistentData = static_cast(fData.get()); @@ -271,7 +328,7 @@ private: template ETensorType GetTemplatedType(T /*obj*/ ){ if (std::is_same::value) return ETensorType::FLOAT; - if (std::is_same::value) return ETensorType::UNINT8; + if (std::is_same::value) return ETensorType::UINT8; if (std::is_same::value) return ETensorType::INT8; if (std::is_same::value) return ETensorType::UINT16; if (std::is_same::value) return ETensorType::INT16; @@ -287,6 +344,12 @@ ETensorType GetTemplatedType(T /*obj*/ ){ } namespace UTILITY{ + + + +// clean operator and tensor names +std::string Clean_name(std::string input_tensor_name); + // Check if two shapes are equal bool AreSameShape(const std::vector&, const std::vector&); bool AreSameShape(const std::vector&, const std::vector&); @@ -296,17 +359,21 @@ bool AreSameShape(const std::vector&, const std::vector&); // Multidirectional broadcast a list of tensors to the same shape std::vector MultidirectionalBroadcastShape(std::vector>); -// Unidirectional broadcast two shapes to the same shape -std::vector UnidirectionalBroadcastShape(std::vector, std::vector); +// Multidirectional broadcast two 
shapes to the same shape + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); +std::vector UnidirectionalBroadcastShape(std::vector &, std::vector &); + +std::pair> MultidirectionalBroadcastShape(std::vector &, std::vector &); + -std::string Clean_name(std::string input_tensor_name); template T* BroadcastConvBias(const T* data, const size_t channel, const std::vector& targetShape) { size_t size = targetShape.size(); if (targetShape[1] != channel) { std::stringstream ss; - ss << "TMVA::SOFIE - Error broadcasting Conv Bias of shape {"; + ss << "SOFIE - Error broadcasting Conv Bias of shape {"; ss << std::to_string(channel); ss << "} to "; ss << ConvertShapeToString(targetShape); @@ -343,16 +410,14 @@ T* BroadcastConvBias(const T* data, const size_t channel, const std::vector, class ContT = std::span > -void BroadcastTensor(ConstContT data, const std::vector& shape, const std::vector& targetShape, ContT broadcastedData) { +template> +void BroadcastTensor(ConstContT data, const std::vector& shape, const std::vector& targetShape, T *broadcastedData) { // Size of the shapes (tensor input here have shapes with same sizes, we have already added the needed ones ) size_t size = shape.size(); // Current length of the broadcasted tensor size_t curLength = data.size(); - size_t targetLength = broadcastedData.size(); - assert(ConvertShapeToLength(targetShape) == targetLength); // special case when broadcasting last dimensions (initial shapes must be the same) - if (shape.front() == targetShape.front() && shape.back() == 1 && size > 1) { + if (size > 1 && shape.front() == targetShape.front() && shape.back() == 1) { size_t bsize = targetShape.back(); // compute the size of the data to broadcast for (int k = int(size)-2; k >=0; k--) { @@ -360,16 +425,16 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st bsize *= targetShape[k]; } for (size_t i = 0; i < curLength; i++) { - std::fill(broadcastedData.begin() + i*bsize, 
broadcastedData.begin() + (i+1)*bsize , data[i]); + std::fill(broadcastedData + i*bsize, broadcastedData + (i+1)*bsize , data[i]); } return; } - std::copy(data.begin(), data.end(), broadcastedData.begin()); + std::copy(data.begin(), data.end(), broadcastedData); // Product of the previous dimensions of targetShape size_t arrayNum = 1; // New broadcasted data: is this needed? - std::vector newData(targetLength); + std::vector newData(ConvertShapeToLength(targetShape)); for (size_t idx = 0; idx < size; idx++) { size_t dim = shape[idx]; @@ -385,8 +450,8 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st for (size_t arrayIdx = 0; arrayIdx < arrayNum; arrayIdx++) { for (size_t targetIdx = 0; targetIdx < targetDim; targetIdx++) { size_t offset = arrayIdx * arrayLength * targetDim + targetIdx * arrayLength; - std::copy(broadcastedData.begin() + arrayIdx * arrayLength, - broadcastedData.begin() + (arrayIdx + 1) * arrayLength, + std::copy(broadcastedData + arrayIdx * arrayLength, + broadcastedData + (arrayIdx + 1) * arrayLength, newData.begin() + offset); } } @@ -400,12 +465,11 @@ void BroadcastTensor(ConstContT data, const std::vector& shape, const st // Update current length curLength = newLength; // Update broadcasted data - std::copy(newData.begin(), newData.begin() + newLength, broadcastedData.begin()); + std::copy(newData.begin(), newData.begin() + newLength, broadcastedData); } // Update the number of arrays arrayNum *= targetDim; } - //return broadcastedData; } // interface where we allocate a new array for broadcasted data @@ -413,10 +477,8 @@ template T* CreateBroadcastTensor(const T* data, const std::vector& shape, const std::vector& targetShape, size_t targetLength) { // newShape is an array of size equal to dimension along which we are broadcasting the tensor T* broadcastedData = new T[targetLength]; - std::span bData(broadcastedData, broadcastedData+targetLength); size_t curLength = ConvertShapeToLength(shape); - std::span inData(data, 
curLength); - BroadcastTensor, std::span>(inData, shape, targetShape, bData); + BroadcastTensor({data, curLength}, shape, targetShape, broadcastedData); return broadcastedData; } // Unidirectional broadcasting shape to targetShape// In unidirectional broadcast - only tensor B can have the shape changed not @@ -429,14 +491,14 @@ T* UnidirectionalBroadcast(const T* data, const std::vector& shape, cons std::vector newShape(targetSize, 1); size_t offset = targetSize - shape.size(); std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape)); + return CreateBroadcastTensor(data, newShape, targetShape, ConvertShapeToLength(targetShape)); } - return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape)); + return CreateBroadcastTensor(data, shape, targetShape, ConvertShapeToLength(targetShape)); } // Unidirectional broadcasting shape to targetShape using a passed vector to avoid allocations template -void UnidirectionalBroadcast(const T* data, const std::vector& shape, const std::vector& targetShape, std::span broadcastedData) { +void UnidirectionalBroadcast(const T* data, const std::vector& shape, const std::vector& targetShape, T *broadcastedData) { size_t curLength = ConvertShapeToLength(shape); std::span inData(const_cast(data), curLength); // Prepend shape with ones @@ -445,12 +507,10 @@ void UnidirectionalBroadcast(const T* data, const std::vector& shape, co std::vector newShape(targetSize, 1); size_t offset = targetSize - shape.size(); std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - BroadcastTensor(inData, newShape, targetShape, broadcastedData); + BroadcastTensor(inData, newShape, targetShape, broadcastedData); } - BroadcastTensor>(inData, shape, targetShape, broadcastedData); + BroadcastTensor(inData, shape, targetShape, broadcastedData); } -// specialization for vector of boolean -void UnidirectionalBroadcast(const 
std::vector & data, const std::vector& shape, const std::vector& targetShape, std::vector & broadcastedData); /// compute stride of a tensor given its shape (assume layout is row-major) std::vector ComputeStrideFromShape(const std::vector & shape); @@ -619,7 +679,15 @@ void col2im(const Dtype* data_col, const int channels, //std::cout << "finishing col2imp" << std::endl; } - +// Used at the end of infer() to fill the return object. +template +void FillOutput(T const *arr, std::vector &out, std::size_t n) +{ + out.resize(n); + for (std::size_t i = 0; i < n; ++i) { + out[i] = arr[i]; + } +} } // end namespace UTILITY @@ -631,20 +699,20 @@ extern "C" void sgemm_(const char * transa, const char * transb, const int * m, struct GNN_Data { - TMVA::Experimental::RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) - TMVA::Experimental::RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) - TMVA::Experimental::RTensor global_data; // the global features, tensor with shape (1, num_global_features) - TMVA::Experimental::RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) + RTensor node_data; // the node feature data, tensor with shape (num_nodes, num_node_features) + RTensor edge_data; // the edge feature data, tensor with shape (num_edges, num_edge_features) + RTensor global_data; // the global features, tensor with shape (1, num_global_features) + RTensor edge_index; // the edge index (receivers and senders for each edge), tensor with shape (2, num_edges) // edge_index[0,:] are the receivers and edge_index[1,:] are the senders // need to have default constructor since RTensor has not one - GNN_Data(): node_data(TMVA::Experimental::RTensor({})), edge_data(TMVA::Experimental::RTensor({})), global_data(TMVA::Experimental::RTensor({})), edge_index(TMVA::Experimental::RTensor({})) {} + GNN_Data(): node_data(RTensor({})), 
edge_data(RTensor({})), global_data(RTensor({})), edge_index(RTensor({})) {} }; template -TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, TMVA::Experimental::RTensor & t2, int axis = 0) +RTensor Concatenate( RTensor & t1, RTensor & t2, int axis = 0) { // concatenate tensor along axis. Shape must be the same except in the dimension of the concatenated axis if (t1.GetMemoryLayout() != t2.GetMemoryLayout()) @@ -659,8 +727,8 @@ TMVA::Experimental::RTensor Concatenate( TMVA::Experimental::RTensor & t1, } std::vector outShape = shape1; outShape[axis] = shape1[axis] + shape2[axis]; - TMVA::Experimental::RTensor tout(outShape, t1.GetMemoryLayout()); - if (t1.GetMemoryLayout() == TMVA::Experimental::MemoryLayout::ColumnMajor) { + RTensor tout(outShape, t1.GetMemoryLayout()); + if (t1.GetMemoryLayout() == MemoryLayout::ColumnMajor) { throw std::runtime_error("TMVA RTensor Concatenate is not yet supported for column major tensors"); } @@ -693,10 +761,10 @@ inline GNN_Data Concatenate(GNN_Data & data1, GNN_Data & data2, int axis = 0) { inline GNN_Data Copy(const GNN_Data & data) { GNN_Data out; - out.node_data = TMVA::Experimental::RTensor(data.node_data.GetShape()); - out.edge_data = TMVA::Experimental::RTensor(data.edge_data.GetShape()); - out.global_data = TMVA::Experimental::RTensor(data.global_data.GetShape()); - out.edge_index = TMVA::Experimental::RTensor(data.edge_index.GetShape()); + out.node_data = RTensor(data.node_data.GetShape()); + out.edge_data = RTensor(data.edge_data.GetShape()); + out.global_data = RTensor(data.global_data.GetShape()); + out.edge_index = RTensor(data.edge_index.GetShape()); std::copy(data.node_data.GetData(), data.node_data.GetData()+ data.node_data.GetSize(), out.node_data.GetData()); std::copy(data.edge_data.GetData(), data.edge_data.GetData()+ data.edge_data.GetSize(), out.edge_data.GetData()); std::copy(data.global_data.GetData(), data.global_data.GetData()+ data.global_data.GetSize(), 
out.global_data.GetData()); @@ -704,6 +772,70 @@ inline GNN_Data Copy(const GNN_Data & data) { return out; } -}//SOFIE +inline void Gemm_Call(float *output, bool transa, bool transb, int m, int n, int k, float alpha, const float *A, + const float *B, float beta, const float *C) +{ + char ct = 't'; + char cn = 'n'; + const int *lda = transa ? &k : &m; + const int *ldb = transb ? &n : &k; + const int *ldc = &m; + if (C != nullptr) { + std::copy(C, C + m * n, output); + } + SOFIE::BLAS::sgemm_(transa ? &ct : &cn, transb ? &ct : &cn, &m, &n, &k, &alpha, A, lda, B, ldb, + &beta, output, ldc); +} + +template +void ReadTensorFromStream(std::istream &is, T &target, std::string const &expectedName, std::size_t expectedLength) +{ + std::string name; + std::size_t length; + is >> name >> length; + if (name != expectedName) { + std::string err_msg = + "TMVA-SOFIE failed to read the correct tensor name; expected name is " + expectedName + " , read " + name; + throw std::runtime_error(err_msg); + } + if (length != expectedLength) { + std::string err_msg = "TMVA-SOFIE failed to read the correct tensor size; expected size is " + + std::to_string(expectedLength) + " , read " + std::to_string(length); + throw std::runtime_error(err_msg); + } + for (size_t i = 0; i < length; ++i) { + is >> target[i]; + } + if (is.fail()) { + throw std::runtime_error("TMVA-SOFIE failed to read the values for tensor " + expectedName); + } +} + + +// code for the memory greeding allocations +struct TensorLifeInfo { + int begin; // start time (op index) lifetime + int end; // end time lifetime + size_t size; // size of tensors in bytes +}; + +struct MemoryResult { + std::size_t total_bytes = 0; // total memory needed + std::vector offsets; // resulted offsets for each tensor +}; + +/// Greedy best-fit planner with coalescing free list. 
+MemoryResult OrganizeMemory(const std::vector & tensorsInfo ); + + +inline std::string ConvertOutputTypeToString(ETensorType t) { + // The std::vector is a special type that is not wrapping continuous memory. + // We don't want to use it as a return type. + if (t == ETensorType::BOOL) t = ETensorType::UINT8; + return ConvertTypeToString(t); +} + + +} // namespace SOFIE -#endif //TMVA_SOFIE_RMODEL +#endif //TMVA_SOFIE_COMMON diff --git a/src/SOFIE_core/src/RModel.cxx b/src/SOFIE_core/src/RModel.cxx index e5495ed..3dd1d23 100644 --- a/src/SOFIE_core/src/RModel.cxx +++ b/src/SOFIE_core/src/RModel.cxx @@ -4,55 +4,21 @@ #include #include +#ifdef SOFIE_SUPPORT_ROOT_BINARY #include "TFile.h" +#endif #include "SOFIE/RModel.hxx" #include "SOFIE/SOFIE_common.hxx" - namespace SOFIE { -std::underlying_type_t operator|(Options opA, Options opB) { - return static_cast>(opA) | static_cast>(opB); -} -std::underlying_type_t operator|(std::underlying_type_t opA, Options opB) { - return opA | static_cast>(opB); -} - -RModel::RModel(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); - fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; +namespace { +const std::string SP = " "; } -RModel& RModel::operator=(RModel&& other) { - fInputTensorInfos = std::move(other.fInputTensorInfos); - fReadyInputTensorInfos = std::move(other.fReadyInputTensorInfos); - fOutputTensorNames = other.fOutputTensorNames; - fInputTensorNames = other.fInputTensorNames; - fOperators = std::move(other.fOperators); 
- fInitializedTensors = std::move(other.fInitializedTensors); - fIntermediateTensorInfos = std::move(other.fIntermediateTensorInfos); - fName = other.fName; - fFileName = other.fFileName; - fParseTime = other.fParseTime; - fGC = other.fGC; - fNeededBlasRoutines = other.fNeededBlasRoutines; - fNeededStdLib = other.fNeededStdLib; - return *this; -} -const std::vector& RModel::GetTensorShape(std::string name) const { +const std::vector& RModel::GetTensorShape(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { return f->second.shape; @@ -69,6 +35,16 @@ const std::vector& RModel::GetTensorShape(std::string name) const { if (f4 != fIntermediateTensorInfos.end()) { return f4->second.shape; } + // case of shape tensors + auto f5 = fShapeTensors.find(name); + if (f5 != fShapeTensors.end()) { + // shape is vector of size 1 with size of shape values or just a scalar + if (f5->second.second) // check scalar flag + return std::vector{}; + else + return std::vector{f5->second.first.size()}; + } + if (fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) throw std::runtime_error("TMVA SOFIE tensor [" + name + "] is a dynamic tensor. 
Use GetDynamicTensorShape instead of GetTensorShape"); @@ -78,7 +54,7 @@ const std::vector& RModel::GetTensorShape(std::string name) const { throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); } -std::vector RModel::GetDynamicTensorShape(std::string name) const { +std::vector RModel::GetDimTensorShape(const std::string & name) const { if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { return f->second.shape; } @@ -89,8 +65,21 @@ std::vector RModel::GetDynamicTensorShape(std::string name) const { // for this we need to return the vector by value return ConvertShapeToDim(GetTensorShape(name)); } +std::vector RModel::GetDynamicTensorShape(const std::string & name) const { + if (auto f = fDynamicTensorInfos.find(name); f != fDynamicTensorInfos.end()) { + return f->second.shape; + } + if (auto f = fInputTensorInfos.find(name); f != fInputTensorInfos.end()) { + return f->second.shape; + } + // throw error if shape is not dynamic + if (!IsDynamicTensor(name)) + throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not dynamic"); + + throw std::runtime_error("TMVA SOFIE tensor [" + name + "] for which the shape is requested is not found"); +} -const ETensorType& RModel::GetTensorType(std::string name) const { +const ETensorType& RModel::GetTensorType(const std::string & name) const { auto f = fReadyInputTensorInfos.find(name); if (f != fReadyInputTensorInfos.end()) { return f->second.type; @@ -111,6 +100,10 @@ const ETensorType& RModel::GetTensorType(std::string name) const { if (f5 != fDynamicTensorInfos.end()){ return f5->second.type; } + // case of shape tensor type is INT64 + if (fShapeTensors.find(name) != fShapeTensors.end()){ + return ETensorType::INT64; + } if (fIsSubGraph && fParentGraph) return fParentGraph->GetTensorType(name); @@ -124,6 +117,7 @@ bool RModel::CheckIfTensorAlreadyExist(std::string tensor_name) { if 
(fInitializedTensors.find(tensor_name) != fInitializedTensors.end()) return true; if (fIntermediateTensorInfos.find(tensor_name) != fIntermediateTensorInfos.end()) return true; if (fDynamicTensorInfos.find(tensor_name) != fDynamicTensorInfos.end()) return true; + if (fShapeTensors.find(tensor_name) != fShapeTensors.end()) return true; if (fIsSubGraph && fParentGraph) return fParentGraph->CheckIfTensorAlreadyExist(tensor_name); return false; } @@ -192,16 +186,34 @@ void RModel::AddConstantTensor(std::string tensor_name, ETensorType type, std::v tensor_name = UTILITY::Clean_name(tensor_name); //NB: own data if (CheckIfTensorAlreadyExist(tensor_name)) { - throw std::runtime_error("TMVA-SOFIE: initialized tensor with name " + tensor_name + " already exists \n"); + throw std::runtime_error("TMVA-SOFIE: constant tensor with name " + tensor_name + " already exists \n"); } InitializedTensor new_tensor {type, shape, data, true}; // add here flag to specify is a constant tensor fInitializedTensors[tensor_name] = new_tensor; } +void RModel::AddShapeTensor(const std::string & name, const std::vector & shape_values, bool scalar){ + auto tensor_name = UTILITY::Clean_name(name); + if (fShapeTensors.count(tensor_name) != 0) { + throw std::runtime_error("TMVA-SOFIE: shape tensor with name " + tensor_name + " already exists \n"); + } + fShapeTensors[tensor_name] = std::make_pair(shape_values, scalar); +} + +bool RModel::IsShapeTensor(const std::string & tensor_name) const { + return fShapeTensors.count(tensor_name) != 0; +} + +const std::vector & RModel::GetShapeTensorValues(const std::string & tensor_name) const { + //if (!IsShapeTensor(tensor_name) ) return std::vector{}; + return fShapeTensors.at(tensor_name).first; +} + bool RModel::IsInitializedTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); return fInitializedTensors.find(name) != fInitializedTensors.end(); } + bool RModel::IsConstantTensor(const std::string& tensorName) const 
{ std::string name = UTILITY::Clean_name(tensorName); auto itr = fInitializedTensors.find(name); @@ -209,9 +221,11 @@ bool RModel::IsConstantTensor(const std::string& tensorName) const { return itr->second.IsConstantTensor(); } +// dynamic tensors include also Dim input tensors bool RModel::IsDynamicTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); - return fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + bool ret = fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end(); + return (ret) ? true : IsDimInputTensor(tensorName); } bool RModel::IsDimInputTensor(const std::string& tensorName) const { std::string name = UTILITY::Clean_name(tensorName); @@ -250,17 +264,21 @@ void RModel::AddDynamicTensor(std::string tensor_name, ETensorType type, std::ve // store shape parameter if not existing for (auto &d : shape) { if (d.isParam) { - if (fShapeParams.count(d.param) == 0) { - // case parameter is an expression of some other existing parameter, no need to - // register it - if (d.dim != size_t(-1)) { - fShapeParams[d.param] = std::to_string(d.dim); - } + if (d.dim != size_t(-1)) { + AddShapeParam(d.param, d.dim); } } } } +void RModel::AddShapeParam(const std::string & param, size_t default_value) { + if (fShapeParams.count(param) == 0) { + fShapeParams[param] = std::to_string(default_value); + // add also in the vector list (used to keep the order) + fDimShapeNames.push_back(param); + } +} + void RModel::AddOutputTensorNameList(std::vector outputtensornames) { fOutputTensorNames.clear(); for(auto& it : outputtensornames) { @@ -293,6 +311,15 @@ std::shared_ptr RModel::GetInitializedTensorData(std::string tensor_name) } } +void RModel::RemoveInitializedTensor(std::string tensor_name) { + auto f = fInitializedTensors.find(tensor_name); + if (f == fInitializedTensors.end()) { + throw std::runtime_error("TMVA-SOFIE: tensor " + tensor_name + " not found when trying to remove it"); + } else { + 
fInitializedTensors.erase(f); + } +} + void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { auto t = fInitializedTensors.find(tensor_name); if (t == fInitializedTensors.end()) { @@ -301,100 +328,180 @@ void RModel::SetNotWritableInitializedTensor(const std::string & tensor_name) { t->second.SetNotWritable(); } -std::string RModel:: AllocateIntermediateMemory(std::span op_output_tensors) { +std::string RModel::AllocateIntermediateMemory(std::span op_output_tensors) +{ + std::stringstream code; - std::string memory_allocation_string = ""; - bool allocated; + if (fVerbose) { + std::cout << "Total chunks allocated\n"; + for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + std::cout << "..... chunk " << chunk->first << " size " << chunk->second.tensor_size << " " << chunk->second.tensor_name << std::endl; + } + } - for (auto& it : op_output_tensors) { - allocated = false; - if (GetTensorType(std::string(it)) == ETensorType::BOOL || - fInitializedTensors.find(std::string(it)) != fInitializedTensors.end() || - fDynamicTensorInfos.find(std::string(it)) != fDynamicTensorInfos.end()) continue; + auto declareIntermediateTensor = [this, &code](std::string const &name, size_t size, size_t location) { + std::string typeName = ConvertTypeToString(GetTensorType(name)); + code << "\n // Allocating memory for intermediate tensor " << name << " with size " << size << " bytes"; + code << "\n" + << typeName << "* tensor_" << name << " = reinterpret_cast<" << typeName + << "*>(fIntermediateMemoryPool.data() + " << location << ");\n"; + }; + + if (fVerbose) std::cout << "*** AllocateIntermediateMemory: Loop on op output tensors\n"; + // order output tensors by size + std::vector ordered_output_tensors; + + for (auto &it : op_output_tensors) { + auto name = std::string(it); + if (GetTensorType(name) == ETensorType::BOOL || fInitializedTensors.find(name) != fInitializedTensors.end() || + 
fDynamicTensorInfos.find(name) != fDynamicTensorInfos.end()) + continue; + + auto tensor_size = GetTypeSize(GetTensorType(name)) * ConvertShapeToLength(GetTensorShape(name)); + // important fill the pair in the ordered output tensors with the string view and not the string + TensorMemoryInfo tmi = {it, tensor_size}; + ordered_output_tensors.push_back(tmi); + } + std::sort(ordered_output_tensors.begin(), ordered_output_tensors.end(), + [](const TensorMemoryInfo &a, const TensorMemoryInfo &b) { return a.tensor_size > b.tensor_size; }); - auto tensor_size = GetTypeSize(GetTensorType(std::string(it))) * ConvertShapeToLength(GetTensorShape(std::string(it))); - memory_allocation_string += "\n // Allocating memory for intermediate tensor " + std::string(it) + " with size " + std::to_string(tensor_size) + " bytes"; + for (auto &it : ordered_output_tensors) { + bool allocated = false; + std::string name = std::string{it.tensor_name}; + size_t tensor_size = it.tensor_size; + if (fVerbose) + std::cout << "output tensor " << name << " size " << tensor_size << std::endl; - for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); chunk != fIntermediateMemoryInfo.available_stack.end(); ) { + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end();) { - // check if available memory chunks can accommodate the tensor - if (chunk->second >= tensor_size) { - auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it, tensor_size); - auto new_chunk_location = chunk->first+chunk->second-tensor_size; - fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; + if (fVerbose) std::cout << ".. available chunk " << chunk->first << " with size = " << chunk->second; + // check if available memory chunks can accommodate the tensor + if (chunk->second >= tensor_size) { + // need to use here string_view (i.e it.tensor_name) + // split returns the new chunk with size of new tensor. 
The free chunk is before the used one + auto new_chunk = fIntermediateMemoryInfo.total_stack[chunk->first].split(it.tensor_name, tensor_size); + auto new_chunk_location = chunk->first + chunk->second - tensor_size; + fIntermediateMemoryInfo.total_stack[new_chunk_location] = new_chunk; - memory_allocation_string += "\n" + ConvertTypeToString(GetTensorType(std::string(it))) + - "* tensor_" + std::string(it) + - " = reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(new_chunk_location) + ");\n"; - chunk->second -= tensor_size; + declareIntermediateTensor(name, tensor_size, new_chunk_location); + chunk->second -= tensor_size; - allocated = true; + allocated = true; - if (chunk->second == 0) { - chunk = fIntermediateMemoryInfo.available_stack.erase(chunk); - } + if (fVerbose) std::cout << " is re-used and split in a new of size " << new_chunk.tensor_size << " at " << new_chunk_location; - break; - } - ++chunk; + if (chunk->second == 0) { + if (fVerbose) std::cout << " and deleted since size matches"; + fIntermediateMemoryInfo.available_stack.erase(chunk); } + if (fVerbose) std::cout << std::endl; + break; + } else if (chunk->first == fIntermediateMemoryInfo.available_stack.rbegin()->first && + fIntermediateMemoryInfo.total_stack.rbegin()->first == chunk->first) { + // case last available chunk is the last in the memory, we can increase that one + fIntermediateMemoryInfo.total_stack[chunk->first] = {it.tensor_name, tensor_size}; + declareIntermediateTensor(name, tensor_size, chunk->first); + fIntermediateMemoryInfo.available_stack.erase(chunk); + allocated = true; + if (fVerbose) std::cout << " is extended with a bigger one of size " << tensor_size << std::endl; + break; + } + ++chunk; + if (fVerbose) std::cout << std::endl; + } - if (!allocated) { - size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() - ? 
0 - : fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; + if (!allocated) { + size_t chunk_idx = fIntermediateMemoryInfo.total_stack.empty() + ? 0 + : fIntermediateMemoryInfo.total_stack.rbegin()->first + + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size; - fIntermediateMemoryInfo.total_stack[chunk_idx] = - { - it, - tensor_size - }; + fIntermediateMemoryInfo.total_stack[chunk_idx] = it; - memory_allocation_string += "\n"+ConvertTypeToString(GetTensorType(std::string(it)))+"* tensor_"+ std::string(it) + "= reinterpret_cast<"+ConvertTypeToString(GetTensorType(std::string(it)))+"*>(fIntermediateMemoryPool + " + std::to_string(chunk_idx) + ");\n"; - } + declareIntermediateTensor(name, tensor_size, chunk_idx); + + if (fVerbose) std::cout << "no chunk available - add in total stack a new chunk with size of tensor and idx : " << chunk_idx + << std::endl; + } } - return memory_allocation_string; + return code.str(); } -void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ - for (auto &it : op_input_tensors){ +void RModel::CheckAndFlushIntermediateMemory(std::span op_input_tensors, const size_t& op_idx){ + if (fVerbose) std::cout << "*** CheckAndFlushIntermediateMemory: Loop on input tensors for op " << op_idx << "\n"; + //print available chunks + if (fVerbose) std::cout << "available chunks before freeing them : \n"; + for (auto chunk = fIntermediateMemoryInfo.available_stack.begin(); + chunk != fIntermediateMemoryInfo.available_stack.end(); chunk++) { + if (fVerbose) std::cout << "-- free chunk " << chunk->first << " size = " << chunk->second << std::endl; + } + for (auto &it : op_input_tensors) { // last occurence of the tensor is reached => flush it from memory + if (fVerbose) std::cout << ".. 
input tensors : " << it; if (fIntermediateTensorFrequencyLookup[it] == op_idx) { + if (fVerbose) std::cout << " flash condition is met - looping on chunks to find matching one \n"; for (auto chunk = fIntermediateMemoryInfo.total_stack.begin(); - chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk ) { - if (chunk->second.tensor_name == it) { - - // check if nearby chunks in available memory can coalesce - auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound(chunk->first); // smallest element greater than the flushed chunk idx - auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) ? fIntermediateMemoryInfo.available_stack.end() : std::prev(first_greater); // largest element smaller than the flushed chunk idx - - // check if the next stack entry is actually adjacent in memory - if (last_smaller->first+last_smaller->second + 1 == chunk->first){ - last_smaller->second += chunk->second.tensor_size; - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); - - if (last_smaller->first + last_smaller->second + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - } else{ - if (chunk->first + chunk->second.tensor_size + 1 == first_greater->first){ - fIntermediateMemoryInfo.total_stack[chunk->first].merge(fIntermediateMemoryInfo.total_stack[first_greater->first]); - first_greater = fIntermediateMemoryInfo.available_stack.erase(first_greater); - } - fIntermediateMemoryInfo.available_stack.insert({ - chunk->first, - chunk->second.tensor_size - }); - } + chunk != fIntermediateMemoryInfo.total_stack.end(); ++chunk) { + if (fVerbose) std::cout << "--- chunk " << chunk->first << " , " << chunk->second.tensor_name << " size " << chunk->second.tensor_size; + if (chunk->second.tensor_name == it) { + if (fVerbose) 
std::cout << " -- Found chunk corresponding to input tensor: " << chunk->first; + // check if nearby chunks in available memory can coalesce + auto first_greater = fIntermediateMemoryInfo.available_stack.upper_bound( + chunk->first); // smallest element greater than the flushed chunk idx + auto last_smaller = (first_greater == fIntermediateMemoryInfo.available_stack.begin()) + ? fIntermediateMemoryInfo.available_stack.end() + : std::prev(first_greater); // largest element smaller than the flushed chunk idx + + // check if the next stack entry is actually adjacent in memory + + if (last_smaller != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == chunk->first) { + // merge chunk with previous one + last_smaller->second += chunk->second.tensor_size; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge(chunk->second); + if (fVerbose) std::cout << " is adjacent in memory with previous one - merge "; + if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + last_smaller->first + last_smaller->second == first_greater->first) { + // merge also with following one + last_smaller->second += first_greater->second; + fIntermediateMemoryInfo.total_stack[last_smaller->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater->first]); + // delete merged one in available stack and in total stack + fIntermediateMemoryInfo.total_stack.erase(first_greater->first); + fIntermediateMemoryInfo.available_stack.erase(first_greater); + if (fVerbose) std::cout << " merge also with following that is free "; + } + fIntermediateMemoryInfo.total_stack.erase(chunk->first); + if (fVerbose) std::cout << std::endl; + break; + } else if (first_greater != fIntermediateMemoryInfo.available_stack.end() && + chunk->first + chunk->second.tensor_size == first_greater->first) { + // merge with first greater + if (fVerbose) std::cout << " is adjacent in memory with following one - merge \n"; + // cannot modify idx of 
first_greter. Insert a new one and delete previous one + size_t new_size = chunk->second.tensor_size + first_greater->second; + size_t first_greater_idx = first_greater->first; + fIntermediateMemoryInfo.available_stack.erase(first_greater); + // cannot use anymore first_greater + fIntermediateMemoryInfo.available_stack.insert({chunk->first, new_size}); + fIntermediateMemoryInfo.total_stack[chunk->first].merge( + fIntermediateMemoryInfo.total_stack[first_greater_idx]); + fIntermediateMemoryInfo.total_stack.erase(first_greater_idx); + } else { + fIntermediateMemoryInfo.available_stack.insert({chunk->first, chunk->second.tensor_size}); + if (fVerbose) std::cout << " insert in the available stack the chunk with size " << chunk->second.tensor_size << std::endl; } + chunk->second.tensor_name = "free"; + break; + } } + } else { + if (fVerbose) std::cout << std::endl; } } } - - void RModel::Initialize(int batchSize, bool verbose) { std::map inputParams; if (batchSize > 0) { @@ -442,7 +549,7 @@ void RModel::Initialize(const std::map & inputParams, bool auto shape = ConvertShapeToInt(input.second.shape); if (verbose) std::cout << "converting input shape for " << input.first << " " << ConvertShapeToString(shape) << " from " - << ConvertDynamicShapeToString(input.second.shape) << std::endl; + << ConvertDimShapeToString(input.second.shape) << std::endl; if (!shape.empty()) { // case shape is defined (not parametric) we add the tensor in the fReadyInputTensorInfos map and // we remove the tensor from the fInputTensorInfo where th eold parametric shape was stored @@ -456,8 +563,12 @@ void RModel::Initialize(const std::map & inputParams, bool else { // store the found parametric shape parameters for (auto &d : input.second.shape) { - if (d.isParam) - fShapeParams[d.param] = std::to_string(d.dim); + if (d.isParam) { + if (fShapeParams.count(d.param) == 0) { + fDimShapeNames.push_back(d.param); + fShapeParams[d.param] = std::to_string(d.dim); + } + } } } } @@ -492,10 +603,11 @@ void 
RModel::Initialize(const std::map & inputParams, bool } fOperators[op_idx]->Initialize(*this); for(auto &it:fOperators[op_idx]->GetOpOutputTensors()){ + std::string name = std::string{it}; if (fIntermediateTensorFrequencyLookup.find(it) == fIntermediateTensorFrequencyLookup.end() && - std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), std::string(it)) == fOutputTensorNames.end() && - fInitializedTensors.find(std::string(it)) == fInitializedTensors.end() && - fDynamicTensorInfos.find(std::string(it)) == fDynamicTensorInfos.end()){ + std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), name) == fOutputTensorNames.end() && + fInitializedTensors.find(name) == fInitializedTensors.end() && + fDynamicTensorInfos.find(name) == fDynamicTensorInfos.end()){ fIntermediateTensorFrequencyLookup[it] = op_idx; } } @@ -534,54 +646,21 @@ void RModel::InitializeSubGraph(std::shared_ptr graph) { } -// Function to generate the code for declaring and initializing constant tensors -// This is for tensors which are not part of weight files and can be created from the Constant operator -template -std::string GenerateConstantTensorCode(const std::pair &t) -{ - std::stringstream strs; - std::string type = ConvertTypeToString(t.second.type()); - size_t length = ConvertShapeToLength(t.second.shape()); - // avoid using stack sizes for constant tensors to reduce compilation time - bool allocateOnStack = (length > 100) ? 
false : true; - - const T *data = t.second.data(); - - // and check if all values are the same - bool sameData = false; - // for non stack allocation check if data are the same - if (!allocateOnStack && length > 1) { - size_t idx = 1; - do { - sameData = (data[idx] == data[idx - 1]); - idx++; - } while (sameData && idx < length); - } - if (allocateOnStack) { - strs << type << " tensor_" << t.first << "[" << length << "] = " << ConvertValuesToString(length, data) << ";\n"; - } else { - strs << "std::vector<" << type << "> fTensor_" << t.first << " = "; - if (sameData) - strs << "std::vector<" << type << ">(" << length << ", " << ConvertValToString(data[0]) << ");\n"; - else { - strs << ConvertValuesToString(length, data) << ";\n"; - } - strs << "const " << type << " * tensor_" + t.first + " = fTensor_" + t.first + ".data();\n"; - } - return strs.str(); -} - void RModel::GenerateInitializedTensorInfo() { if (!fInitializedTensors.empty()) fGC += "// initialized tensors\n"; for (auto &i : fInitializedTensors) { + if (i.second.IsNotWritable()) continue; if (!fUseWeightFile || i.second.IsConstantTensor()) { - if (i.second.type() == ETensorType::FLOAT) + if (i.second.type() == ETensorType::FLOAT) { fGC += GenerateConstantTensorCode(i); - else if (i.second.type() == ETensorType::INT64) + fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 4; + } else if (i.second.type() == ETensorType::INT64) { fGC += GenerateConstantTensorCode(i); + fConstantTensorSize += ConvertShapeToLength(i.second.shape()) * 8; + } } else { // case of tensors which are read from a file @@ -589,43 +668,55 @@ void RModel::GenerateInitializedTensorInfo() if (i.second.type() == ETensorType::FLOAT) { fGC += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; fGC += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fWeightsTensorSize += ConvertShapeToLength(i.second.shape()) * 4; } } } } void RModel::GenerateIntermediateMemoryPool() { - 
if (fIntermediateMemoryInfo.total_stack.size() == 0) return; + if (fIntermediateMemoryInfo.total_stack.empty()) return; fGC += "\n//--- Allocating session memory pool to be used for allocating intermediate tensors\n"; // char memory block is allocated since char takes 1 byte, thus easier to allocate tensors // of other data types - fGC += "char* fIntermediateMemoryPool = new char[" + std::to_string(fIntermediateMemoryInfo.total_stack.rbegin()->first + fIntermediateMemoryInfo.total_stack.rbegin()->second.tensor_size)+ "];\n\n"; + auto const &totalStack = fIntermediateMemoryInfo.total_stack; + const size_t memPoolSize = totalStack.rbegin()->first + totalStack.rbegin()->second.tensor_size; + fGC += "std::vector fIntermediateMemoryPool = std::vector(" + std::to_string(memPoolSize) + ");\n\n"; } void RModel::GenerateIntermediateTensorInfo() { if (!fIntermediateTensorInfos.empty()) { std::string tensor_declaration_block = ""; - for (auto &i : fIntermediateTensorInfos) { if (i.second.type == ETensorType::BOOL) { - tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; - // No pointer allocation needed for BOOL + tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(ConvertShapeToLength(i.second.shape)) + ");\n"; + tensor_declaration_block += "std::uint8_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + continue; } - if (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end() && std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == fOutputTensorNames.end()) { + bool is_extended = (fOptimizationLevel == OptimizationLevel::kExtended); + bool not_in_freq_map = + (fIntermediateTensorFrequencyLookup.find(i.first) == fIntermediateTensorFrequencyLookup.end()); + bool not_in_output_names = + (std::find(fOutputTensorNames.begin(), fOutputTensorNames.end(), i.first) == 
fOutputTensorNames.end()); + + if ((not_in_freq_map && not_in_output_names) || (!not_in_freq_map && !is_extended && not_in_output_names)) { size_t length = ConvertShapeToLength(i.second.shape); if (i.second.type == ETensorType::FLOAT) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "float * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 4 * length; } else if (i.second.type == ETensorType::DOUBLE) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "double * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; } else if (i.second.type == ETensorType::INT64) { tensor_declaration_block += "std::vector fTensor_" + i.first + " = std::vector(" + std::to_string(length) + ");\n"; tensor_declaration_block += "int64_t * tensor_" + i.first + " = fTensor_" + i.first + ".data();\n"; + fOtherTensorSize += 8 * length; } } } @@ -664,17 +755,17 @@ void RModel::GenerateOperatorDeclarations() { fGC += "\n"; } -void RModel::GenerateDynamicTensorInfo() { - fGC += "//---- allocate the intermediate dynamic tensors\n"; - std::stringstream out; - for (auto & i: fDynamicTensorInfos) { - auto length = ConvertDynamicShapeToLength(i.second.shape); - out << SP << "if (" << length << " > 0) {\n"; - out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; - out << SP << SP << "tensor_" << i.first << " = fTensor_" << i.first << ".data();\n"; - out << SP << "}\n"; - } - fGC += out.str(); +void RModel::GenerateDynamicTensorInfo() +{ + std::stringstream out; + for (auto &i : fDynamicTensorInfos) { + auto length = ConvertDimShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << SP << SP << "fTensor_" << i.first << ".resize(" << length << ");\n"; + out << SP << SP << "tensor_" << 
i.first << " = fTensor_" << i.first << ".data();\n"; + out << SP << "}\n"; + } + fGC += out.str(); } std::string RModel::GenerateInferSignature(bool isdecl) { @@ -702,7 +793,7 @@ std::string RModel::GenerateInferSignature(bool isdecl) { if (type == "other") throw std::runtime_error("TMVA-SOFIE: input tensor " + name + " is of a data type which is not yet supported."); - rGC += type + "* "; + rGC += type + " const* "; } rGC += "tensor_" + name + ","; i_input++; @@ -712,96 +803,73 @@ std::string RModel::GenerateInferSignature(bool isdecl) { return rGC; } -namespace { - -std::string createOutputTensor(RModel const &rmodel, std::string const &name, bool isIntermediateTensor) +void RModel::GenerateOutput() { - if(name.empty()) return "{}"; - ETensorType eOutputType = rmodel.GetTensorType(name); - std::string outputType = ConvertTypeToString(eOutputType); - if (isIntermediateTensor) { - - if (eOutputType == ETensorType::BOOL) { - return "fTensor_" + name; - } else { - // need to check is size is the same(don't want to return a vector with larger size) - // in that case better to copy - return "std::vector<" + ConvertTypeToString(eOutputType) + ">(tensor_" + name + ", tensor_" + name + " + " + - std::to_string(ConvertShapeToLength(rmodel.GetTensorShape(name))) + ")"; - } - } - // include also dynamic tensors since the vectors can be allocated with a size larger than their output - // we need a special handling for bool type allocated as vector - auto outputLength = ConvertDynamicShapeToLength(rmodel.GetDynamicTensorShape(name)); - if (rmodel.IsDynamicTensor(name) && eOutputType == ETensorType::BOOL) { - return "std::vector(fTensor_" + name + ".begin(), fTensor_" + name + ".begin() + " + outputLength + ")"; - } - return "std::vector<" + outputType + ">(tensor_" + name + ", tensor_" + name + " + " + outputLength + ")"; -} - -} // namespace - -void RModel::GenerateOutput() { - - if (fVerbose) - std::cout << "Generating main inference code for " << fName << std::endl; - 
size_t outputSize = fOutputTensorNames.size(); // assume output types are all the same - if (outputSize == 0) - throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); bool sameOutputTypes = true; std::string inferReturnType; // type return by infer function - ETensorType eOutputType = GetTensorType(*fOutputTensorNames.begin()); - std::string outputType = ConvertTypeToString(eOutputType); + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); fGC += "\n\n"; if (outputSize == 1) { - fGC += "std::vector<" + outputType + ">"; + fGC += "std::vector<" + ConvertOutputTypeToString(eFirstOutputType) + ">"; } else { // if all output types are the same we return an std::vector - otherwise a tuple - for (size_t i = 1; i < outputSize; i++) { - if (GetTensorType(fOutputTensorNames[i]) != eOutputType) + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) sameOutputTypes = false; } if (sameOutputTypes) - fGC += "std::vector>"; + fGC += "std::vector>"; else { inferReturnType = "std::tuple<"; for (size_t i = 0; i < outputSize; i++) { - inferReturnType += "std::vector<" + ConvertTypeToString(GetTensorType(fOutputTensorNames[i])) + ">"; - if (i < outputSize-1) inferReturnType += ","; + inferReturnType += "std::vector<" + ConvertOutputTypeToString(GetTensorType(fOutputTensorNames[i])) + ">"; + if (i < outputSize - 1) + inferReturnType += ","; } inferReturnType += ">"; fGC += inferReturnType; } } - fGC += " infer("; + fGC += " infer(" + GenerateInferSignature() + "){\n"; - fGC += GenerateInferSignature(); - - fGC += "){\n"; - - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - if (fVerbose) std::cout << "Generating code for operator .... 
" << op_idx << std::endl; - fGC += (fOperators[op_idx]->Generate(std::to_string(op_idx))); + std::string doInferArgs = GenerateInferSignature(false); + if (!doInferArgs.empty()) + doInferArgs += ","; + for (std::string const &name : fOutputTensorNames) { + fGC += SP + "std::vector<" + ConvertOutputTypeToString(GetTensorType(name)) + " > output_tensor_" + name + ";\n"; + doInferArgs += " output_tensor_" + name + ","; } + if (!doInferArgs.empty()) + doInferArgs.back() = ' '; + + fGC += SP + "doInfer(" + doInferArgs + ");\n"; fGC += SP + "return {"; - for (size_t i = 0; i < outputSize; i++) { - std::string tensorName = *(fOutputTensorNames.begin() + i); - bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; - fGC += createOutputTensor(*this, tensorName, isIntermediate); - if (i < outputSize - 1) + for (size_t i = 0; i < fOutputTensorNames.size(); i++) { + fGC += "output_tensor_" + fOutputTensorNames[i]; + if (i < fOutputTensorNames.size() - 1) fGC += ","; } fGC += "};\n"; - fGC += "}\n"; // end of infer function scope + fGC += "}\n"; // end of infer function scope } void RModel::GenerateSessionCode() { + // Determine the signature of the actual inference function + std::string doInferSignature = GenerateInferSignature(); + if (!doInferSignature.empty()) + doInferSignature += ", "; + for (auto const &name : fOutputTensorNames) { + doInferSignature += " std::vector<" + ConvertOutputTypeToString(GetTensorType(name)) + "> &output_tensor_" + name + ","; + } + doInferSignature.back() = ' '; + + doInferSignature = "void doInfer(" + doInferSignature + ")"; // define the Session struct (for GNN this is generated in RModel_GNN) if (fUseSession && !fIsGNNComponent) { @@ -814,24 +882,31 @@ void RModel::GenerateSessionCode() // generate code for declaring the initialized tensors GenerateInitializedTensorInfo(); - // evaluate total intermediate memory and position intermediate tensor addresses - std::string intermediate_memory_alloc_string = ""; - 
intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; - for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { - intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); - CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); - } + if (fOptimizationLevel == OptimizationLevel::kExtended) { + // evaluate total intermediate memory and position intermediate tensor addresses + std::string intermediate_memory_alloc_string = ""; + intermediate_memory_alloc_string += "\n// --- Positioning intermediate tensor memory --"; + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) { + auto op = fOperators[op_idx].get(); + std::cout << "\n******************\n analyzing input/output operator " << op_idx << " " + << typeid(*op).name() << std::endl; + } + intermediate_memory_alloc_string += AllocateIntermediateMemory(fOperators[op_idx]->GetOpOutputTensors()); + CheckAndFlushIntermediateMemory(fOperators[op_idx]->GetOpInputTensors(), op_idx); + } - // to check remaining unused fragments after memory allocation (lesser the better) - // for (const auto &it: fIntermediateMemoryInfo.available_stack){ - // std::cout<<"chunk_idx: "<Generate(std::to_string(op_idx))); + } + + fGC += SP + "using SOFIE::UTILITY::FillOutput;\n\n"; + + for (std::string const &name : fOutputTensorNames) { + // need to check is size is the same (don't want to return a vector with + // larger size) in that case better to copy + bool isIntermediate = fIntermediateTensorInfos.count(name) > 0; + std::string n = isIntermediate ? 
std::to_string(ConvertShapeToLength(GetTensorShape(name))) + : ConvertDimShapeToLength(GetDimTensorShape(name)); + fGC += SP + "FillOutput(tensor_" + name + ", output_tensor_" + name + ", " + n + ");\n"; + } + + fGC += "}\n\n"; + + // generate the inference overload that returns an output struct GenerateOutput(); // end of session if (fUseSession && !fIsGNNComponent) { - fGC += "}; // end of Session\n"; + fGC += "}; // end of Session\n\n"; } } @@ -982,8 +1087,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { fGC += " f.seekg(" + std::to_string(pos) + ");\n"; } - fGC += " std::string tensor_name;\n"; - fGC += " size_t length;\n"; + fGC += " using SOFIE::ReadTensorFromStream;\n"; // loop on tensors and parse the file for (auto& i: fInitializedTensors) { @@ -991,25 +1095,8 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { if (!i.second.IsWeightTensor()) continue; std::string tensor_name = "tensor_" + i.first; if (i.second.type() == ETensorType::FLOAT) { - size_t length = 1; - length = ConvertShapeToLength(i.second.shape()); - std::string slength = std::to_string(length); - fGC += " f >> tensor_name >> length;\n"; - fGC += " if (tensor_name != \"" + tensor_name + "\" ) {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor name; expected name is " + - tensor_name + " , read \" + tensor_name;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " if (length != " + slength + ") {\n"; - fGC += " std::string err_msg = \"TMVA-SOFIE failed to read the correct tensor size; expected size is " + - slength + " , read \" + std::to_string(length) ;\n"; - fGC += " throw std::runtime_error(err_msg);\n"; - fGC += " }\n"; - fGC += " for (size_t i = 0; i < length; ++i)\n"; - fGC += " f >> " + tensor_name + "[i];\n"; - fGC += " if (f.fail()) {\n"; - fGC += " throw std::runtime_error(\"TMVA-SOFIE failed to read the values for tensor " + tensor_name + "\");\n"; - fGC += " }\n"; + std::string length = 
std::to_string(ConvertShapeToLength(i.second.shape())); + fGC += " ReadTensorFromStream(f, " + tensor_name + ", \"" + tensor_name + "\", " + length + ");\n"; } else { std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a file"); } @@ -1019,6 +1106,7 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { // generate the code to read initialized tensors from a ROOT data file if(fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY fGC += " {\n"; fGC += " std::unique_ptr rootFile(TFile::Open(filename.c_str(), \"READ\"));\n"; fGC += " if (!rootFile->IsOpen()) {\n"; @@ -1050,6 +1138,9 @@ void RModel::ReadInitializedTensorsFromFile(long pos) { fGC += " }\n"; } fGC += " }\n"; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY } } @@ -1075,6 +1166,7 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { // Write the initialized tensors to the file if (fWeightFile == WeightFileType::RootBinary) { +#ifdef SOFIE_SUPPORT_ROOT_BINARY if(fIsGNNComponent || fIsGNN) { throw std::runtime_error("SOFIE-GNN yet not supports writing to a ROOT file."); } @@ -1118,6 +1210,9 @@ long RModel::WriteInitializedTensorsToFile(std::string filename) { // this needs to be changed, similar to the text file return -1; +#else + throw std::runtime_error("SOFIE was not built with ROOT file support."); +#endif // SOFIE_SUPPORT_ROOT_BINARY } else if (fWeightFile == WeightFileType::Text) { std::ofstream f; if(fIsGNNComponent) { @@ -1244,9 +1339,9 @@ void RModel::PrintOutputTensors() { for (auto& it: fOutputTensorNames) { std::cout << "Tensor name: \"" << it << "\"\t"; if (!IsDynamicTensor(it)) - std::cout << "shape: " << ConvertShapeToString(GetTensorShape(it)) << std::endl; - else - std::cout << "shape: " << ConvertDynamicShapeToString(GetDynamicTensorShape(it)) << std::endl; + std::cout << "shape: " << 
ConvertShapeToString(GetTensorShape(it)) << std::endl; + else + std::cout << "shape: " << ConvertDimShapeToString(GetDimTensorShape(it)) << std::endl; } std::cout << "\n"; } @@ -1312,13 +1407,13 @@ void RModel::OutputGenerated(std::string filename, bool append) { void RModel::Streamer(TBuffer &R__b) { if (R__b.IsReading()) { RModel::Class()->ReadBuffer(R__b, this); - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastPersistentToShared(); + for (auto & i : fInitializedTensors) { + i.second.CastPersistentToShared(); } } else { - for(auto i=RModel::fInitializedTensors.begin(); i!=RModel::fInitializedTensors.end(); ++i) { - i->second.CastSharedToPersistent(); + for (auto & i : fInitializedTensors) { + i.second.CastSharedToPersistent(); } RModel::Class()->WriteBuffer(R__b, this); } diff --git a/src/SOFIE_core/src/RModel_ALPAKA.cxx b/src/SOFIE_core/src/RModel_ALPAKA.cxx new file mode 100644 index 0000000..f1945b7 --- /dev/null +++ b/src/SOFIE_core/src/RModel_ALPAKA.cxx @@ -0,0 +1,447 @@ +#include +#include +#include +#include +#include + +#include "TFile.h" +#include "SOFIE/RModel.hxx" +#include "SOFIE/SOFIE_common.hxx" + +namespace SOFIE { + +void RModel::GenerateInitializedTensorInfo_GPU_ALPAKA() { + if (!fInitializedTensors.empty()){ + fGC += "\n// initialized tensors for weights\n"; + } + + for (auto &i : fInitializedTensors) { + if (!fUseWeightFile || i.second.IsConstantTensor()) { + if (i.second.type() == ETensorType::FLOAT) + fGC += GenerateConstantTensorCode(i); + else if (i.second.type() == ETensorType::INT64) + fGC += GenerateConstantTensorCode(i); + + } + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type() == ETensorType::INT64) { + fGC += "BufI641D 
deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + + } +} + +void RModel::GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA() +{ + if (!fInitializedTensors.empty()) + fGC += "// temporary initialized tensors for loading weights\n"; + + for (auto &i : fInitializedTensors) { + if (fUseWeightFile && !i.second.IsConstantTensor()) { + // case of tensors which are read from a file + size_t length = ConvertShapeToLength(i.second.shape()); + if (i.second.type() == ETensorType::FLOAT) { + fGC += "std::vector tensor_" + i.first + "(" + std::to_string(length) + ");\n"; + } + } + } +} + +void RModel::GenerateGPU_ALPAKA_Buffers() { + if (!fIntermediateTensorInfos.empty()) { + std::string tensor_declaration_block = ""; + + for (auto &i : fIntermediateTensorInfos) { + if (i.second.type == ETensorType::BOOL) { + tensor_declaration_block += "std::vector fTensor_" + i.first + + " = std::vector(" + + std::to_string(ConvertShapeToLength(i.second.shape)) + + ");\n"; + // No pointer allocation needed for BOOL + } + + size_t length = ConvertShapeToLength(i.second.shape); + + if (i.second.type == ETensorType::FLOAT) { + tensor_declaration_block += "BufF1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + tensor_declaration_block += "BufD1D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } else if (i.second.type == ETensorType::INT64) { + tensor_declaration_block += "BufI641D deviceBuf_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" + + std::to_string(length) + "}));\n"; + } + } + + if (tensor_declaration_block.length()) { + fGC += "\n//--- declare and allocate the intermediate tensors\n" + tensor_declaration_block; + } + } + + // add also the dynamic tensors (only declarations, allocation will be done later) + if 
(!fDynamicTensorInfos.empty()) { + fGC += "//--- declare the dynamic tensors\n"; + fGC += "using bufDev_float = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_double = alpaka::Buf, size_t>;\n"; + fGC += "using bufDev_int64 = alpaka::Buf, size_t>;\n"; + + for (auto &i : fDynamicTensorInfos) { + if (i.second.type == ETensorType::FLOAT) { + fGC += "bufDev_float bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::DOUBLE) { + fGC += "bufDev_double bufDev_" + i.first + ";\n"; + } else if (i.second.type == ETensorType::INT64) { + fGC += "bufDev_int64 bufDev_" + i.first + ";\n"; + } + } + } +} + +void RModel::GenerateDynamicTensorInfo_GPU_ALPAKA() { + fGC += "//---- allocate the intermediate dynamic tensors\n"; + std::stringstream out; + + for (auto &i : fDynamicTensorInfos) { + auto length = ConvertDimShapeToLength(i.second.shape); + out << SP << "if (" << length << " > 0) {\n"; + out << "auto bufDev_" + i.first + + " = alpaka::allocBuf(devAcc, Ext1D::all(Idx{" << length << "}));\n"; + out << SP << "}\n"; + } + fGC += out.str(); +} + +std::string RModel::GenerateInferSignature_GPU_ALPAKA(bool isdecl) { + // generate the infer signature given the inputs: eg. "BufF1D const deviceBuf_A, BufF1D const deviceBuf_B" + // if (isdecl = false) generate only calling signature (deviceBuf_A, deviceBuf_B, ....) 
+ + auto GetBufType = [this](const std::string& name) -> std::string { + ETensorType type = GetTensorType(name); + if (type == ETensorType::FLOAT) return "BufF1D"; + if (type == ETensorType::DOUBLE) return "BufD1D"; + if (type == ETensorType::INT64) return "BufI641D"; + throw std::runtime_error("TMVA-SOFIE: input tensor " + name + + " is of a data type which is not yet supported."); + }; + + std::string rGC; + std::unordered_map inputParams; + int i_input = 0; + for (auto &name : fInputTensorNames) { + // if is a dynamic tensor pass initial parameters + if (IsDimInputTensor(name)) { + auto shape = GetDynamicTensorShape(name); + for (auto &d : shape) { + std::string pName = d.param; + if (d.isParam && inputParams.count(pName) == 0) { + if (isdecl) rGC += "size_t "; + rGC += d.param + ","; + inputParams[pName] = i_input; + } + } + } + if (isdecl) { + rGC += GetBufType(name) + " const "; + } + rGC += "deviceBuf_" + name + ","; + i_input++; + } + + if (fInputTensorNames.size() > 0) rGC.pop_back(); // remove last "," + return rGC; +} + +void RModel::GenerateOutput_GPU_ALPAKA() { + if (fVerbose) + std::cout << "Generating main inference code for " << fName << std::endl; + + size_t outputSize = fOutputTensorNames.size(); + if (outputSize == 0) + throw std::runtime_error("TMVA-SOFIE: output size=0 are not supported"); + + bool sameOutputTypes = true; + std::string inferReturnType; + ETensorType eFirstOutputType = GetTensorType(*fOutputTensorNames.begin()); + + fGC += "\n\n"; + if (outputSize == 1) { + fGC += "alpaka::Buf"; + } else { + // if all output types are the same we return an std::vector - otherwise a tuple + for (std::string const &name : fOutputTensorNames) { + if (GetTensorType(name) != eFirstOutputType) + sameOutputTypes = false; + } + if (sameOutputTypes) + fGC += "std::array, " + std::to_string(outputSize) + ">"; + else { + inferReturnType = "std::tuple<"; + for (size_t i = 0; i < outputSize; i++) { + inferReturnType += "alpaka::Buf"; + if (i < outputSize - 
1) + inferReturnType += ","; + } + inferReturnType += ">"; + fGC += inferReturnType; + } + } + + fGC += " infer("; + fGC += GenerateInferSignature_GPU_ALPAKA(); + fGC += "){\n"; + + for (size_t op_idx = 0; op_idx < fOperators.size(); ++op_idx) { + if (fVerbose) + std::cout << "Generating code for operator .... " << op_idx << std::endl; + fGC += (fOperators[op_idx]->Generate_GPU_ALPAKA(std::to_string(op_idx))); + } + + // fGC += "\n\n alpaka::wait(queue);\n"; + fGC += SP + "return "; + if (outputSize>1) fGC += " {"; + for (size_t i = 0; i < outputSize; i++) { + std::string tensorName = *(fOutputTensorNames.begin() + i); + bool isIntermediate = fIntermediateTensorInfos.count(tensorName) > 0; + fGC += "deviceBuf_"+tensorName; + if (i < outputSize - 1) + fGC += ","; + } + if (outputSize>1) fGC += " };\n"; + else fGC += ";\n"; + fGC += "}\n"; // end of infer function scope +} + +void RModel::GenerateSessionCode_GPU_ALPAKA() { + + std::set registered_operators; + std::set single_initialized_operators = { + SOFIE::OperatorKind::RELU, + SOFIE::OperatorKind::SIGMOID, + SOFIE::OperatorKind::TANH, + SOFIE::OperatorKind::SOFTMAX, + SOFIE::OperatorKind::LEAKYRELU, + SOFIE::OperatorKind::EINSUM, + SOFIE::OperatorKind::COMPARISON, + SOFIE::OperatorKind::ELU, + }; + bool OpNeedsBlas = false; + + // single initiation operators must only be initialized only once and their count should be stored in the registered_operators set to avoid generating multiple kernels for the same operator kind + fGC += "\n//--- ALPAKA Kernels\n"; + for (size_t id = 0; id < fOperators.size(); id++) { + if(fOperators[id]->GetKind() == OperatorKind::GEMM){ + OpNeedsBlas = true; + } + if(single_initialized_operators.find(fOperators[id]->GetKind()) != single_initialized_operators.end()) { + + if(registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + + if (fVerbose) + std::cout<<"Generating ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + + fGC 
+= fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } else { + if (fVerbose) + std::cout<<"Generating ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_ALPAKA(std::to_string(id)); + } + } + + // define the Session struct (for GNN this is generated in RModel_GNN) + fGC += "\n\ntemplate \n"; + if (fUseSession) { + if (!fIsSubGraph) + fGC += "struct Session {\n\n"; + else + fGC += "struct Session_" + fName + " {\n\n"; + } + + // define host and device accelerators + fGC += "using Idx = std::size_t;\n"; + fGC += "using Dim = alpaka::DimInt<1>;\n"; + fGC += "using Acc = alpaka::TagToAcc;\n"; + fGC += "using DevAcc = alpaka::Dev;\n\n"; + fGC += "using QueueProperty = alpaka::NonBlocking;\n"; + fGC += "using QueueAcc = alpaka::Queue;\n\n"; + fGC += "using BufF1D = alpaka::Buf;\n"; + fGC += "using BufD1D = alpaka::Buf;\n"; + fGC += "using BufI641D = alpaka::Buf;\n\n"; + + fGC += "\nalpaka::Platform const platform{};\n"; + fGC += "DevAcc devAcc = alpaka::getDevByIdx(platform, 0);\n"; + fGC += "alpaka::PlatformCpu platformHost{};\n"; + fGC += "alpaka::DevCpu hostAcc = alpaka::getDevByIdx(platformHost, 0);\n"; + fGC += "QueueAcc queue{devAcc};\n"; + fGC += "Idx threadsPerBlock = 256;\n"; + fGC += "\nusing Ext1D = alpaka::Vec;\n"; + fGC += "using Vec = alpaka::Vec;\n"; + if (OpNeedsBlas) { + fGC += "\n\n// BLAS declarations\n"; + fGC += "sofieBLAS blas{queue};\n"; + } + + GenerateInitializedTensorInfo_GPU_ALPAKA(); + GenerateGPU_ALPAKA_Buffers(); + GenerateOperatorDeclarations(); + + // add subgraph session + if (!fSubGraphs.empty()) + fGC += "// subgraph sessions\n"; + for (auto &graph : fSubGraphs) { + fGC += "Session_" + graph->fName + " fSession_" + graph->fName + ";\n"; + } + + // Session constructor + if (fUseSession) { + std::string sessionName = "\n\nSession"; + if (fIsSubGraph) + sessionName += "_" + fName; + + if 
(fUseWeightFile) { + std::string fileName = fName; + if (fWeightFile == WeightFileType::Text) + fileName += ".dat"; + if (fWeightFile == WeightFileType::RootBinary) + fileName += ".root"; + + fGC += sessionName + "(std::string filename =\"" + fileName + "\""; + } else { + fGC += sessionName + "(std::string = \"\""; + } + + if (!fShapeParams.empty()) { + for (auto &p : fShapeParams) { + fGC += ",\n"; + fGC += " size_t " + p.first + " = " + p.second; + } + } + fGC += ") {\n"; + + GenerateTemporaryInitializedTensorContainers_GPU_ALPAKA(); + if (fUseWeightFile) { + fGC += "\n//--- reading weights from file\n"; + ReadInitializedTensorsFromFile(0); + fGC += "\n"; + } + + MoveInitializedTensorsToBuffers_ALPAKA(); + GenerateDynamicTensorInfo_GPU_ALPAKA(); + + for (size_t id = 0; id < fOperators.size(); id++) { + fGC += fOperators[id]->GenerateInitCode_GPU_ALPAKA(); + if (fOperators[id]->GetKind() == OperatorKind::GEMM){ + fGC += "\nblas.AddLayoutConfig("+fOperators[id]->GetBlasConfig()+");\n"; + } + } + + fGC += "\nalpaka::wait(queue);\n"; + fGC += "}\n\n"; + } + + registered_operators.clear(); + + for (size_t id = 0; id < fOperators.size(); id++) { + + if(single_initialized_operators.find(fOperators[id]->GetKind()) != single_initialized_operators.end()) { + + if(registered_operators.find(fOperators[id]->GetKind()) == registered_operators.end()) { + + if (fVerbose) + std::cout<<"Declaring ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + + fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + registered_operators.insert(fOperators[id]->GetKind()); + } + } else { + if (fVerbose) + std::cout<<"Declaring ALPAKA kernel for operator"<< toString(fOperators[id]->GetKind()) << std::endl; + fGC += fOperators[id]->Generate_GPU_Kernel_Definitions_ALPAKA(std::to_string(id)); + } + } + + GenerateOutput_GPU_ALPAKA(); + + if (fUseSession && !fIsGNNComponent) { + fGC += "}; // end of Session\n"; + } +} + +void 
RModel::GenerateGPU_ALPAKA(std::underlying_type_t<Options> options, int batchSize, bool verbose) {
+   fVerbose = verbose;
+   fBatchSize = batchSize;
+
+   if (static_cast<std::underlying_type_t<Options>>(Options::kNoSession) & options) {
+      fUseSession = false;
+      fWeightFile = WeightFileType::None;
+   }
+   if (static_cast<std::underlying_type_t<Options>>(Options::kNoWeightFile) & options) {
+      fUseWeightFile = false;
+      fWeightFile = WeightFileType::None;
+   }
+   if (static_cast<std::underlying_type_t<Options>>(Options::kRootBinaryWeightFile) & options) {
+      fUseWeightFile = true;
+      fWeightFile = WeightFileType::RootBinary;
+   }
+   if (fUseWeightFile && !fUseSession) {
+      throw std::runtime_error(
+         "TMVA-SOFIE: RModel::Generate: cannot use a separate weight file without generating a Session class");
+   }
+
+   if (static_cast<std::underlying_type_t<Options>>(Options::kGNN) & options ||
+       static_cast<std::underlying_type_t<Options>>(Options::kGNNComponent) & options)
+      throw std::runtime_error("SOFIE GPU does not yet support GNN Inference.");
+
+   Initialize(batchSize, verbose);
+
+   std::string hgname;
+   if (!fIsSubGraph) {
+      fGC.clear();
+      GenerateHeaderInfo_GPU_ALPAKA(hgname);
+   }
+
+   if (fVerbose)
+      std::cout << "generate Main session code - model " << fName << std::endl;
+
+   GenerateSessionCode_GPU_ALPAKA();
+
+   if (!fIsSubGraph) {
+      fGC += ("} //SOFIE_" + fName + "\n");
+      fGC += "\n#endif // " + hgname + "\n";
+   }
+}
+
+void RModel::MoveInitializedTensorsToBuffers_ALPAKA(){
+   for (auto &i : fInitializedTensors) {
+      if (i.second.IsNotWritable()) continue;
+      std::string tensor_name = "tensor_" + i.first;
+      auto length = ConvertShapeToLength(i.second.shape());
+      std::string slength = std::to_string(length);
+      if (i.second.type() == ETensorType::FLOAT) {
+         fGC += " auto hostBuf_"+i.first+" = alpaka::createView(hostAcc, tensor_"+i.first+");\n";
+         fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n";
+      } else if (i.second.type() == ETensorType::DOUBLE) {
+         fGC += " auto hostBuf_"+i.first+" = alpaka::createView(hostAcc, tensor_"+i.first+");\n";
+         fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n";
+      
} else if (i.second.type() == ETensorType::INT64) {
+         fGC += " auto hostBuf_"+i.first+" = alpaka::createView(hostAcc, tensor_"+i.first+", " + slength + ");\n";
+         fGC += " alpaka::memcpy(queue, deviceBuf_"+i.first+", hostBuf_"+i.first+");\n";
+      } else {
+         throw std::runtime_error("tmva-sofie tensor " + tensor_name + " with type " + ConvertTypeToString(i.second.type()) + " cannot be read from a ROOT file");
+      }
+   }
+}
+
+} // namespace SOFIE
diff --git a/src/SOFIE_core/src/RModel_Base.cxx b/src/SOFIE_core/src/RModel_Base.cxx
index d4d1f1c..f212c53 100644
--- a/src/SOFIE_core/src/RModel_Base.cxx
+++ b/src/SOFIE_core/src/RModel_Base.cxx
@@ -58,6 +58,38 @@ void RModel_Base::GenerateHeaderInfo(std::string& hgname) {
    }
 }
 
+void RModel_Base::GenerateHeaderInfo_GPU_ALPAKA(std::string& hgname) {
+   fGC += ("//Code generated automatically by TMVA for GPU Inference using ALPAKA of Model file [" + fFileName + "] at [" + fParseTime.substr(0, fParseTime.length()-1) +"] \n");
+   // add header guards
+   hgname = fName;
+   std::transform(hgname.begin(), hgname.end(), hgname.begin(), [](unsigned char c) {
+      return std::toupper(c);
+   } );
+   hgname = "SOFIE_" + hgname;
+   fGC += "\n#ifndef " + hgname + "\n";
+   fGC += "#define " + hgname + "\n\n";
+   for (auto& i: fNeededStdLib) {
+      fGC += "#include <" + i + ">\n";
+   }
+   for (auto& i: fCustomOpHeaders) {
+      fGC += "#include \"" + i + "\"\n";
+   }
+   fGC += "#include \n";
+   fGC += "#include \n";
+
+   // for the session we need to include SOFIE_Common functions
+   //needed for convolution operator (need to add a flag)
+   fGC += "#include \"SOFIE/SOFIE_common.hxx\"\n";
+   if (fUseWeightFile)
+      fGC += "#include \n";
+   // Include TFile when saving the weights in a binary ROOT file
+   if (fWeightFile == WeightFileType::RootBinary)
+      fGC += "#include \"TFile.h\"\n";
+
+   fGC += "\nusing Dim1D = alpaka::DimInt<1>;\n";
+
+   fGC += "\nnamespace SOFIE_" + fName + "{\n";
+}
+
 void RModel_Base::OutputGenerated(std::string filename, bool append) {
 // the model can be
appended only if a file name is provided if (filename.empty()) { diff --git a/src/SOFIE_core/src/RModel_GNN.cxx b/src/SOFIE_core/src/RModel_GNN.cxx index a1dfe06..3dae254 100644 --- a/src/SOFIE_core/src/RModel_GNN.cxx +++ b/src/SOFIE_core/src/RModel_GNN.cxx @@ -94,7 +94,7 @@ void RModel_GNN::Generate() { // the number of output edges features can be smaller, so we need to correct here auto num_edge_features_input = num_edge_features; - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -117,7 +117,7 @@ void RModel_GNN::Generate() { // we need to correct the output number of node features auto num_node_features_input = num_node_features; - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/RModel_GraphIndependent.cxx b/src/SOFIE_core/src/RModel_GraphIndependent.cxx index bab06b3..cd62d0c 100644 --- a/src/SOFIE_core/src/RModel_GraphIndependent.cxx +++ b/src/SOFIE_core/src/RModel_GraphIndependent.cxx @@ -81,7 +81,7 @@ void RModel_GraphIndependent::Generate() { // the number of output edges features can be smaller, so we need to correct here // assume num_edge_features is not a 
parametric shape - auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDynamicTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto edges_update_output_shape = edges_update_block->GetFunctionBlock()->GetDimTensorShape(edges_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!edges_update_output_shape[1].isParam && edges_update_output_shape[1].dim != num_edge_features_input) { num_edge_features = edges_update_output_shape[1].dim; } @@ -100,7 +100,7 @@ void RModel_GraphIndependent::Generate() { fGC+="};\n}\n"; // we need to correct the output number of node features - auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDynamicTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto nodes_update_output_shape = nodes_update_block->GetFunctionBlock()->GetDimTensorShape(nodes_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!nodes_update_output_shape[1].isParam && nodes_update_output_shape[1].dim != num_node_features_input) { num_node_features = nodes_update_output_shape[1].dim; } @@ -119,7 +119,7 @@ void RModel_GraphIndependent::Generate() { // we need to correct the output number of global features // global features are in shape[1] #if 0 - auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDynamicTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); + auto globals_update_output_shape = globals_update_block->GetFunctionBlock()->GetDimTensorShape(globals_update_block->GetFunctionBlock()->GetOutputTensorNames()[0]); if(!globals_update_output_shape[1].isParam && globals_update_output_shape[1].dim != num_global_features_input) { num_global_features = globals_update_output_shape[1].dim; } diff --git a/src/SOFIE_core/src/SOFIE_common.cxx b/src/SOFIE_core/src/SOFIE_common.cxx index ad74313..cd1b60a 100644 --- a/src/SOFIE_core/src/SOFIE_common.cxx +++ 
b/src/SOFIE_core/src/SOFIE_common.cxx @@ -1,15 +1,18 @@ #include "SOFIE/SOFIE_common.hxx" -#include + +#include #include #include +#include +#include +#include - -namespace SOFIE{ +namespace SOFIE { /// @brief Convert shape from integer format to dynamic one (based on Dim) /// @param shape /// @return shape based on Dim -std::vector ConvertShapeToDim(std::vector shape){ +std::vector ConvertShapeToDim(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ ret_shape[i].dim = shape[i]; @@ -20,7 +23,7 @@ std::vector ConvertShapeToDim(std::vector shape){ /// @brief Convert shape based on Dim to integer format /// @param shape /// @return shape based on integer. Return an empty shape in case shape is dynamic (has a parameter) -std::vector ConvertShapeToInt(std::vector shape){ +std::vector ConvertShapeToInt(const std::vector & shape){ std::vector ret_shape(shape.size()); for (size_t i =0; i < shape.size(); i++){ if (shape[i].isParam) { @@ -46,7 +49,7 @@ std::vector ConvertShapeToInt(std::vector shape){ } -std::size_t ConvertShapeToLength(std::vector shape){ +std::size_t ConvertShapeToLength(const std::vector & shape){ // Empty shape represent scalar values, so we return a length=1 std::size_t fLength = 1; for (auto& dim: shape) fLength *= dim; @@ -58,6 +61,9 @@ std::string ConvertTypeToString(ETensorType type){ case ETensorType::FLOAT : { return "float"; } + case ETensorType::INT8 : { + return "int8_t"; + } case ETensorType::INT16 : { return "int16_t"; } @@ -67,6 +73,9 @@ std::string ConvertTypeToString(ETensorType type){ case ETensorType::INT64 : { return "int64_t"; } + case ETensorType::UINT8 : { + return "uint8_t"; + } case ETensorType::UINT16 : { return "uint16_t"; } @@ -80,7 +89,7 @@ std::string ConvertTypeToString(ETensorType type){ return "double"; } case ETensorType::BOOL : { - return "bool"; + return "uint8_t"; } default:{ return "other_" + std::to_string( (int) type); @@ -106,7 +115,7 @@ ETensorType 
ConvertStringToType(std::string type){ } } -std::string ConvertShapeToString(std::vector shape) { +std::string ConvertShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { @@ -117,41 +126,49 @@ std::string ConvertShapeToString(std::vector shape) { return out.str(); } -std::string ConvertDynamicShapeToString(std::vector shape) { +std::string ConvertDimShapeToString(const std::vector & shape) { std::stringstream out; out << "{ "; for (size_t i = 0; i < shape.size(); i++) { - out << shape[i].GetVal(); + out << shape[i]; if (i < shape.size()-1) out << " , "; } out << " }"; return out.str(); } -std::string ConvertDynamicShapeToLength(std::vector shape) { +std::string ConvertDimShapeToLength(const std::vector & shape) { // convert generic shape to a string // multiply all the integer specified dimensions of the shape std::string length; - size_t int_length = 0; + // case of empty vectors return 1 + if (shape.empty()) return "1"; + int64_t int_length = -1; for (size_t i = 0; i < shape.size(); i++) { if (shape[i].isParam) { if (!length.empty()) length += " * "; length += shape[i].param; } else { - if (int_length == 0) + if (int_length == -1) int_length = shape[i].dim; else int_length *= shape[i].dim; } } // multiply the integer components to the parametric one - if (int_length > 0) { - if (!length.empty()) length += " * "; - length += std::to_string(int_length); + // if larger than 1 - otherwise returns -1 + if (int_length >= 0) { + if (!length.empty() && int_length > 1) { + length += " * "; + length += std::to_string(int_length); + } else if (length.empty()) { // case is full known shape + length = std::to_string(int_length); + } } return length; } + namespace{ template static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* input, T* target){ //only visible within this translation unit @@ -169,6 +186,12 @@ static inline void copy_vector_data(int_t no_of_copies, int_t input_size, T* 
inp } } +bool IsInteger(const std::string & s) { + int value; + auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), value); + return ec == std::errc() && ptr == s.data() + s.size(); +} + bool UTILITY::AreSameShape(const std::vector& shapeA, const std::vector& shapeB) { if (shapeA.size() != shapeB.size()) { return false; @@ -330,17 +353,24 @@ std::vector UTILITY::MultidirectionalBroadcastShape(std::vector UTILITY::UnidirectionalBroadcastShape(std::vector shapeA, std::vector shapeB) +// check multi-directional broadcasting of two shapes (need to pass inputs by non const ref. since we might prepends with one's +// return a pair of integer flag and new broadcasted shape +// if flag = 0: shape are identical +// flag = 1: return shape is equal to A, we broadcast B +// flag = 2: return shape is equal to B we broadcast A +// flag = 3: return shape is common of two we broadcast A and B to output +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { size_t sizeA = shapeA.size(); size_t sizeB = shapeB.size(); // Check if A and B have the same shape if (UTILITY::AreSameShape(shapeA, shapeB)){ - return shapeA; + return std::make_pair(0, shapeA); } // Find the common shape of A and B size_t size = std::max(sizeA, sizeB); if (sizeA < size) { + // prepend 1's in A to make of same shape as B std::vector newShapeA(size, 1); size_t offset = size - sizeA; std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); @@ -359,36 +389,117 @@ std::vector UTILITY::UnidirectionalBroadcastShape(std::vector s break; } } + int broadcastFlag = 0; if (broadcastable) { // The output shape is max(outShape, targetShape) std::vector targetShape(size, 1); for (size_t i = 0; i < size; i++) { targetShape[i] = std::max(shapeA[i], shapeB[i]); + if (shapeB[i] < targetShape[i]) broadcastFlag |= 1; + if (shapeA[i] < targetShape[i]) broadcastFlag |= 2; } - return targetShape; + return std::make_pair(broadcastFlag, targetShape); } else { throw - 
std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + ConvertShapeToString(shapeA) + " and " + ConvertShapeToString(shapeB) + " to a common shape."); } } +// unidirectional broadcast- of shape A to target B +std::vector UTILITY::UnidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) +{ + auto ret = UTILITY::MultidirectionalBroadcastShape(shapeB, shapeA); + if (ret.first > 1) { + throw + std::runtime_error("TMVA::SOFIE - Error unidirectional broadcasting tensors of shape " + + ConvertShapeToString(shapeA) + " to " + ConvertShapeToString(shapeB) + + " in a common shape."); + } + return ret.second; +} + +// for broadcasting Dim shapes +// flag indicates also which vector needs to be broadcasted +// flag & 1 == 1 : broadcast B -> A +// flag & 2 == 2 : broadcast A -> B +// flag & 4 == 4 a run time check is needed on shapes with values +std::pair> UTILITY::MultidirectionalBroadcastShape(std::vector & shapeA, std::vector & shapeB) { + size_t sizeA = shapeA.size(); + size_t sizeB = shapeB.size(); + // Check if A and B have the same shape + if (UTILITY::AreSameShape(shapeA, shapeB)){ + return std::make_pair(0, shapeA); + } + // Find the common shape of A and B + size_t size = std::max(sizeA, sizeB); + if (sizeA < size) { + // prepend 1's in A to make of same shape as B + std::vector newShapeA(size, Dim{1}); + size_t offset = size - sizeA; + std::copy(shapeA.begin(), shapeA.end(), newShapeA.begin() + offset); + shapeA = std::move(newShapeA); + } + if (sizeB < size) { + std::vector newShapeB(size, Dim{1}); + size_t offset = size - sizeB; + std::copy(shapeB.begin(), shapeB.end(), newShapeB.begin() + offset); + shapeB = std::move(newShapeB); + } -// UNidirectional boradcast specializaiton for vector - -// specialization for vector of boolean -void UTILITY::UnidirectionalBroadcast(const std::vector & data, const std::vector& shape, const 
std::vector& targetShape, std::vector & broadcastedData) - { - // Prepend shape with ones - auto ncdata = const_cast &>(data); - if (shape.size() < targetShape.size()) { - size_t targetSize = targetShape.size(); - std::vector newShape(targetSize, 1); - size_t offset = targetSize - shape.size(); - std::copy(shape.begin(), shape.end(), newShape.begin() + offset); - UTILITY::BroadcastTensor &, std::vector &>(ncdata, newShape, targetShape, broadcastedData); - } - UTILITY::BroadcastTensor &, std::vector &>(ncdata, shape, targetShape, broadcastedData); + int broadcastFlag = 0; + // The output shape is targetShape + std::vector targetShape(size); + for (size_t i = 0; i < size; i++) { + // assume we broadcast to the parametric value + if (shapeA[i] == shapeB[i]) { + targetShape[i] = shapeA[i]; + } else if (shapeA[i].isParam && shapeB[i].GetVal() == "1" ) { + // broadcast B to A (case A is parametric with ) + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].GetVal() == "1" && shapeB[i].isParam) { + // broadcast A to B + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else if (!shapeA[i].isParam && !shapeB[i].isParam) { + if (shapeB[i].dim == 1) { + targetShape[i] = shapeA[i]; + broadcastFlag |= 1; + } else if (shapeA[i].dim == 1) { + targetShape[i] = shapeB[i]; + broadcastFlag |= 2; + } else { + // non broadcastable case cannot have A and B two different defined shapes different than one + broadcastFlag = -1; + } + } else if (shapeA[i].isParam && shapeB[i].isParam) { + // full dynamic case - we will decided at run time + std::stringstream s; + s << "std::max(" << shapeA[i] << "," << shapeB[i] << ")"; + // use -1 for dim to indicate is an expression + targetShape[i] = Dim { s.str() , static_cast(-1)}; + broadcastFlag |= 4; + } else if (shapeA[i].isParam && !shapeB[i].isParam) { + // A -> B need to check at run time if consistent + targetShape[i] = shapeB[i]; + broadcastFlag |= 6; + } else if (!shapeA[i].isParam && shapeB[i].isParam) { + // B -> 
A need to check at run time if consistent + targetShape[i] = shapeA[i]; + broadcastFlag |= 5; + } else { + // all cases should be covered + throw std::runtime_error("TMVA::SOFIE - Fatal error in MultiDirectionalBroadCastDimShape"); + } + } + if (broadcastFlag == -1) { + throw std::runtime_error("TMVA::SOFIE - Error multidirectional broadcasting tensors of shape " + + ConvertDimShapeToString(shapeA) + " and " + ConvertDimShapeToString(shapeB) + + " to a common shape."); + } + + return std::make_pair(broadcastFlag, targetShape); } std::string UTILITY::Clean_name(std::string input_tensor_name){ @@ -413,15 +524,146 @@ std::vector UTILITY::ComputeStrideFromShape(const std::vector & shape) // assume row major layout const auto size = shape.size(); std::vector strides(size); - strides[size-1] = Dim{1}; - for (std::size_t i = 1; i < size; i++) { - if (!shape[size-i].isParam && !strides[size-i].isParam) - strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; - else - strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + if (size > 0) { + strides[size-1] = Dim{1}; + for (std::size_t i = 1; i < size; i++) { + if (!shape[size-i].isParam && !strides[size-i].isParam) + strides[size - 1 - i] = Dim{strides[size-i].dim * shape[size-i].dim}; + else { + if (strides[size-i].GetVal() == "1") + strides[size - 1 - i] = shape[size-i]; + else if (shape[size-i].GetVal() == "1") + strides[size - 1 - i] = strides[size-i]; + else + strides[size - 1 - i] = Dim{std::string(strides[size-i].GetVal() + "*" + shape[size-i].GetVal())}; + } + } } return strides; } +struct FreeBlock { + std::size_t offset; + std::size_t size; + bool operator<(const FreeBlock& other) const { + // order by offset for deterministic coalescing + return offset < other.offset; + } +}; + +struct MemoryEvent { + int t; // time (i.e. 
operator index) + int type; // 0 = END first, 1 = START + int idx; // tensor index + bool operator<(const MemoryEvent& o) const { + if (t != o.t) return t < o.t; + return type < o.type; // END before START at the same time + } +}; + +/// Greedy best-fit planner with coalescing free list. +MemoryResult OrganizeMemory(const std::vector & tensorsInfo ) +{ + // Basic validation + for (const auto &t : tensorsInfo) { + if (!(t.end > t.begin)) { + throw std::runtime_error("Each tensor must have end > begin."); + } + } + + // Build events: free before allocate at equal times. + std::vector events; + events.reserve(tensorsInfo.size() * 2); + for (int i = 0; i < (int)tensorsInfo.size(); ++i) { + events.push_back({tensorsInfo[i].end, 0, i}); // END + events.push_back({tensorsInfo[i].begin, 1, i}); // START + } + std::sort(events.begin(), events.end()); + + std::vector tensorsOffset(tensorsInfo.size()); + + // Free list ordered by offset (for O(log n) coalescing) + // and faster insert/erase with respect to a vector + std::set free_list; + + // Bookkeeping: size/offset map for frees. + std::unordered_map live_size; + std::unordered_map live_offset; + + std::size_t total_bytes = 0; + + auto allocate_best_fit = [&](std::size_t need) -> std::size_t { + // Find the *smallest* block whose size >= need (best-fit). + // Since free_list is ordered by offset, we scan to find best by size. + // (For very large sets you could maintain a multimap by size as well.) + auto best = free_list.end(); + for (auto it = free_list.begin(); it != free_list.end(); ++it) { + if (it->size >= need) { + if (best == free_list.end() || it->size < best->size) + best = it; + } + } + if (best != free_list.end()) { + std::size_t off = best->offset; + if (best->size == need) { + free_list.erase(best); + } else { + FreeBlock updated{best->offset + need, best->size - need}; + free_list.erase(best); + free_list.insert(updated); + } + return off; + } + // No free block large enough; grow the heap. 
+ std::size_t off = total_bytes; + total_bytes += need; + return off; + }; + + auto try_coalesce = [&](std::set::iterator it) { + // Coalesce with previous + if (it != free_list.begin()) { + auto prev = std::prev(it); + if (prev->offset + prev->size == it->offset) { + FreeBlock merged{prev->offset, prev->size + it->size}; + free_list.erase(prev); + it = free_list.erase(it); + it = free_list.insert(merged).first; + } + } + // Coalesce with next + auto next = std::next(it); + if (next != free_list.end() && it->offset + it->size == next->offset) { + FreeBlock merged{it->offset, it->size + next->size}; + free_list.erase(next); + it = free_list.erase(it); + free_list.insert(merged); + } + }; + + // Sweep through time. + for (const auto &e : events) { + if (e.type == 0) { // END: free + auto it_sz = live_size.find(e.idx); + auto it_off = live_offset.find(e.idx); + if (it_sz != live_size.end() && it_off != live_offset.end()) { + FreeBlock fb{it_off->second, it_sz->second}; + // Insert and coalesce with neighbors + auto it = free_list.insert(fb).first; + try_coalesce(it); + live_size.erase(it_sz); + live_offset.erase(it_off); + } + } else { // START: allocate + auto &t = tensorsInfo[e.idx]; + std::size_t off = allocate_best_fit(t.size); + tensorsOffset[e.idx] = off; + live_size[e.idx] = t.size; + live_offset[e.idx] = off; + } + } + + return MemoryResult{total_bytes, std::move(tensorsOffset)}; +} -}//SOFIE +} // namespace SOFIE \ No newline at end of file diff --git a/src/SOFIE_core/test/CMakeLists.txt b/src/SOFIE_core/test/CMakeLists.txt index 34bb49f..fd848df 100644 --- a/src/SOFIE_core/test/CMakeLists.txt +++ b/src/SOFIE_core/test/CMakeLists.txt @@ -1,131 +1,191 @@ -# Copyright (C) 1995-2021, Rene Brun and Fons Rademakers. -# All rights reserved. -# -# For the licensing terms see $ROOTSYS/LICENSE. -# For the list of contributors see $ROOTSYS/README/CREDITS. 
+cmake_minimum_required(VERSION 3.14)
+include(FetchContent)
 ############################################################################
-# CMakeLists.txt file for building TMVA SOFIE tests.
-# @author Federico Sossai, Sanjiban Sengupta
+# Basic setup
 ############################################################################
-
 include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_core/inc)
 include_directories(${CMAKE_SOURCE_DIR}/src/SOFIE_parsers/inc)
 
+set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
 if (NOT ONNX_MODELS_DIR)
   set(ONNX_MODELS_DIR input_models)
 endif()
 
-# Finding .onnx files to be parsed and creating the appropriate code to
-# parse all file. It is much faster to combine all parsing in a single executable
-# which will avoid initialization time (especially when using ROOT)
-set(CAPTURE_STR "EmitModel( \"@1\", \"@2\");")
-set(ALL_CAPTURES "")
-# Finding .onnx files to be parsed and creating the appropriate command
+option(ENABLE_ALPAKA_TESTS "Enable Alpaka-based SOFIE tests" OFF)
+
+set(ALPAKA_BACKEND "cuda"
+    CACHE STRING "Alpaka backend to test (cuda, cpu, hip, sycl)")
+set_property(CACHE ALPAKA_BACKEND PROPERTY STRINGS cuda cpu hip sycl)
+
+############################################################################
+# Generate emitter sources
+############################################################################
+set(CAPTURE_STR
+"try {\n\
+  EmitModel(\"@1\", \"@2\");\n\
+} catch (const std::exception& e) {\n\
+  std::string msg = e.what();\n\
+  if (msg.find(\"multiple output tensors are not supported\") != std::string::npos) {\n\
+    std::cerr << \"[SKIP] Multiple outputs are not supported for @1\" << std::endl;\n\
+  } else if (msg.find(\"is of a data type which is not yet supported\") != std::string::npos) {\n\
+    std::cerr << \"[SKIP] Operator with unsupported data type in @1: \" << msg << std::endl;\n\
+  } else {\n\
+    std::cerr << \"[ERROR] Failed processing @1: \" << msg << std::endl;\n\
+    failures++;\n\
+  }\n\
+} catch 
(...) {\n\
+  std::cerr << \"[ERROR] Unknown failure processing @1\" << std::endl;\n\
+  failures++;\n\
+}\n\
+")
+
 file(GLOB ONNX_FILES "${ONNX_MODELS_DIR}/*.onnx")
+
+set(ALL_CAPTURES "")
 foreach(onnx_file ${ONNX_FILES})
   get_filename_component(fname ${onnx_file} NAME_WE)
-  get_filename_component(fdir ${onnx_file} DIRECTORY)
-  string(REPLACE "@1" ${onnx_file} cap ${CAPTURE_STR})
-  string(REPLACE "@2" ${fname} cap ${cap})
-  list(APPEND ALL_CAPTURES ${cap})
+  string(REPLACE "@1" "${onnx_file}" cap "${CAPTURE_STR}")
+  string(REPLACE "@2" "${fname}" cap "${cap}")
+  string(APPEND ALL_CAPTURES "${cap}")
 endforeach()
-string(REPLACE ";" ";\n" EMIT_CAPTURES "${ALL_CAPTURES}")
+
+set(EMIT_CAPTURES "${ALL_CAPTURES}")
+
 configure_file(EmitFromONNX.cxx.in EmitFromONNX_all.cxx @ONLY)
-configure_file(EmitFromRoot.cxx.in EmitFromRoot_all.cxx @ONLY)
+configure_file(EmitFromONNX_GPU_ALPAKA.cxx.in EmitFromONNX_GPU_ALPAKA_all.cxx @ONLY)
+
+############################################################################
+# Alpaka tests
+############################################################################
+if (ENABLE_ALPAKA_TESTS)
+
+  string(TOLOWER "${ALPAKA_BACKEND}" _alpaka_backend)
+  if (NOT _alpaka_backend MATCHES "^(cuda|cpu|hip|sycl)$")
+    message(FATAL_ERROR "Unsupported ALPAKA_BACKEND=${ALPAKA_BACKEND}")
+  endif()
+
+  FetchContent_Declare(
+    sofieBLAS
+    GIT_REPOSITORY https://github.com/ML4EP/sofieBLAS
+    GIT_TAG dev
+  )
+  FetchContent_MakeAvailable(sofieBLAS)
 
-ROOTTEST_GENERATE_EXECUTABLE(emitFromONNX EmitFromONNX_all.cxx
protobuf::libprotobuf SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-build) - -# silence protobuf warnings seen in version 3.0 and 3.6. Not needed from protobuf version 3.17 -target_compile_options(emitFromONNX PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -ROOTTEST_ADD_TEST(SofieCompileModels_ONNX - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNX ${onnx_file} ${CMAKE_CURRENT_BINARY_DIR}/${fname} - FIXTURES_REQUIRED sofie-compile-models-onnx-build - FIXTURES_SETUP sofie-compile-models-onnx -) - -# Creating a Google Test -if (BLAS_FOUND) # we need BLAS for compiling the models - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromONNX TestCustomModelsFromONNX.cxx - LIBRARIES - MathCore - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-onnx - FIXTURES_SETUP - sofie-test-models-onnx-build + FIXTURES_SETUP sofie-compile-models-onnx-alpaka-build ) - target_include_directories(TestCustomModelsFromONNX PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromONNX - EXEC ./TestCustomModelsFromONNX - FIXTURES_REQUIRED sofie-test-models-onnx-build) -endif() -# For testing serialisation of RModel object - -ROOTTEST_GENERATE_EXECUTABLE(emitFromROOT EmitFromRoot_all.cxx - LIBRARIES protobuf::libprotobuf RIO SOFIE_core SOFIE_parsers - FIXTURES_SETUP sofie-compile-models-onnx-root -) -# silence protobuf warnings seen in version 3.0 and 3.6. 
Not needed from protobuf version 3.17 -target_compile_options(emitFromROOT PRIVATE -Wno-unused-parameter -Wno-array-bounds) - -# Automatic compilation of headers from root files -ROOTTEST_ADD_TEST(SofieCompileModels_ROOT - COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromROOT - FIXTURES_REQUIRED sofie-compile-models-onnx-root - FIXTURES_SETUP sofie-compile-models-root -) - -if (BLAS_FOUND) - # Creating a Google Test for Serialisation of RModel - ROOTTEST_GENERATE_EXECUTABLE(TestCustomModelsFromROOT TestCustomModelsFromROOT.cxx - LIBRARIES - SOFIE_core - BLAS::BLAS - GTest::gtest - GTest::gtest_main - FIXTURES_REQUIRED - sofie-compile-models-root - FIXTURES_SETUP - sofie-test-models-root-build + target_compile_options(emitFromONNXAlpaka PRIVATE + -Wno-unused-parameter + -Wno-array-bounds ) - target_include_directories(TestCustomModelsFromROOT PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - ROOTTEST_ADD_TEST(TestCustomModelsFromROOT - EXEC ./TestCustomModelsFromROOT - FIXTURES_REQUIRED sofie-test-models-root-build) -endif() -# Look for needed Python modules -ROOT_FIND_PYTHON_MODULE(torch) -if (ROOT_TORCH_FOUND) - configure_file(Conv1dModelGenerator.py Conv1dModelGenerator.py COPYONLY) - configure_file(Conv2dModelGenerator.py Conv2dModelGenerator.py COPYONLY) - configure_file(Conv3dModelGenerator.py Conv3dModelGenerator.py COPYONLY) - configure_file(ConvTrans2dModelGenerator.py ConvTrans2dModelGenerator.py COPYONLY) - configure_file(LinearModelGenerator.py LinearModelGenerator.py COPYONLY) - configure_file(RecurrentModelGenerator.py RecurrentModelGenerator.py COPYONLY) - - if (BLAS_FOUND) - ROOT_ADD_GTEST(TestSofieModels TestSofieModels.cxx - LIBRARIES - SOFIE_core - SOFIE_parsers - BLAS::BLAS - INCLUDE_DIRS - ${CMAKE_CURRENT_BINARY_DIR} + ROOTTEST_ADD_TEST( + SofieCompileModels_ONNX_Alpaka + COMMAND ${CMAKE_COMMAND} -E env ROOTIGNOREPREFIX=1 ./emitFromONNXAlpaka + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka-build + FIXTURES_SETUP 
sofie-compile-models-onnx-alpaka + ) + + ########################################################################## + # CUDA backend + ########################################################################## + if (_alpaka_backend STREQUAL "cuda") + + message(STATUS "Enabling Alpaka CUDA tests") + + enable_language(CUDA) + find_package(CUDAToolkit REQUIRED) + + set_source_files_properties( + TestCustomModelsFromONNXForAlpakaCuda.cxx + PROPERTIES LANGUAGE CUDA + ) + + ROOTTEST_GENERATE_EXECUTABLE( + TestCustomModelsFromONNXForAlpakaCuda + TestCustomModelsFromONNXForAlpakaCuda.cxx + LIBRARIES MathCore SOFIE_core GTest::gtest GTest::gtest_main + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + FIXTURES_SETUP sofie-test-models-onnx-alpaka-build ) - endif() -endif() -ROOT_EXECUTABLE(emitGNN GNN/EmitGNN.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGNN COMMAND emitGNN) + target_include_directories( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ${CMAKE_CURRENT_BINARY_DIR} + ${alpaka_SOURCE_DIR}/include + ${sofieblas_SOURCE_DIR}/include + ${ROOT_INCLUDE_DIRS} + ${CUDAToolkit_INCLUDE_DIRS} + ${CMAKE_CURRENT_SOURCE_DIR} + ) + + set_target_properties( + TestCustomModelsFromONNXForAlpakaCuda + PROPERTIES + CUDA_SEPARABLE_COMPILATION OFF + CUDA_ARCHITECTURES "70;80;86" + ) + + target_compile_definitions( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + ALPAKA_ACC_GPU_CUDA_ENABLED + ALPAKA_HAS_STD_ATOMIC_REF + ) + + target_compile_options( + TestCustomModelsFromONNXForAlpakaCuda PRIVATE + $<$<COMPILE_LANGUAGE:CUDA>: + --extended-lambda + --expt-relaxed-constexpr + --generate-line-info + --use_fast_math + -g + -G + # -fsanitize=address + -O1 + -Wno-deprecated-gpu-targets + > + $<$<COMPILE_LANGUAGE:CXX>: + -O2 + -g + -G + -fPIC + -pthread + > + ) + # set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fsanitize=address") + + # ROOT-compatible: plain signature only + target_link_libraries( + TestCustomModelsFromONNXForAlpakaCuda + CUDA::cudart + CUDA::cublas + CUDA::cublasLt + ${ROOT_LIBRARIES} + ) + + 
ROOTTEST_ADD_TEST( + TestCustomModelsFromONNXForAlpakaCuda + EXEC ./TestCustomModelsFromONNXForAlpakaCuda + FIXTURES_REQUIRED sofie-compile-models-onnx-alpaka + ) -ROOT_EXECUTABLE(EmitGraphIndependent GNN/EmitGraphIndependent.cxx LIBRARIES SOFIE_core) -ROOT_ADD_TEST(tmva-sofie-EmitGraphIndependent COMMAND EmitGraphIndependent) + endif() # cuda backend +endif() # ENABLE_ALPAKA_TESTS diff --git a/src/SOFIE_core/test/EmitFromONNX.cxx.in b/src/SOFIE_core/test/EmitFromONNX.cxx.in index f7a56e2..c464f4d 100644 --- a/src/SOFIE_core/test/EmitFromONNX.cxx.in +++ b/src/SOFIE_core/test/EmitFromONNX.cxx.in @@ -23,7 +23,13 @@ int EmitModel(std::string filename, std::string outname) { int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; } diff --git a/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in b/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in new file mode 100644 index 0000000..58198c1 --- /dev/null +++ b/src/SOFIE_core/test/EmitFromONNX_GPU_ALPAKA.cxx.in @@ -0,0 +1,27 @@ +// Author: Sanjiban Sengupta + +#include "SOFIE/RModel_Base.hxx" +#include "SOFIE/RModel.hxx" +#include "SOFIE/RModelParser_ONNX.hxx" + +using namespace SOFIE; + +int EmitModel(std::string filename, std::string outname) { + + RModelParser_ONNX parser; + RModel model = parser.Parse(filename); + model.GenerateGPU_ALPAKA(); + model.OutputGenerated(outname+"_FromONNX_GPU_ALPAKA.hxx"); + + return 0; +} + +int main(int argc, char *argv[]) { + + int failures = 0; + + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ONNX with ALPAKA] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 
0 : 1; +} diff --git a/src/SOFIE_core/test/EmitFromRoot.cxx.in b/src/SOFIE_core/test/EmitFromRoot.cxx.in index 4a630c7..88c0789 100644 --- a/src/SOFIE_core/test/EmitFromRoot.cxx.in +++ b/src/SOFIE_core/test/EmitFromRoot.cxx.in @@ -43,6 +43,10 @@ int EmitModel(std::string inputfile, std::string outname){ int main(int argc, char *argv[]){ -@EMIT_CAPTURES@ ; + int failures = 0; + @EMIT_CAPTURES@ + + std::cout << "[SUMMARY for generation from ROOT] Completed with " << failures << " failures" << std::endl; + return failures == 0 ? 0 : 1; } diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx b/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx index d02dc5e..14eb6a3 100644 --- a/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx +++ b/src/SOFIE_core/test/TestCustomModelsFromONNX.cxx @@ -812,7 +812,7 @@ TEST(ONNX, LinearWithLeakyRelu) { constexpr float TOLERANCE = 1; - // Preparing the standard all-ones input + // Preparing input std::vector input({ 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, @@ -2515,7 +2515,7 @@ TEST(ONNX, Equal){ }); SOFIE_Equal::Session s("Equal_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Equal_ExpectedOutput::outputs) / sizeof(bool)); @@ -2540,7 +2540,7 @@ TEST(ONNX, LessOrEqual){ }); SOFIE_LessOrEqual::Session s("LessOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(LessOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2565,7 +2565,7 @@ TEST(ONNX, GreaterOrEqual){ }); SOFIE_GreaterOrEqual::Session
s("GreaterOrEqual_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(GreaterOrEqual_ExpectedOutput::outputs) / sizeof(bool)); @@ -2590,7 +2590,7 @@ TEST(ONNX, Greater){ }); SOFIE_Greater::Session s("Greater_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Greater_ExpectedOutput::outputs) / sizeof(bool)); @@ -2615,7 +2615,7 @@ TEST(ONNX, Less){ }); SOFIE_Less::Session s("Less_FromONNX.dat"); - std::vector output = s.infer(input1.data(),input2.data()); + std::vector output = s.infer(input1.data(),input2.data()); // Checking output size EXPECT_EQ(output.size(), sizeof(Less_ExpectedOutput::outputs) / sizeof(bool)); diff --git a/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx new file mode 100644 index 0000000..1303251 --- /dev/null +++ b/src/SOFIE_core/test/TestCustomModelsFromONNXForAlpakaCuda.cxx @@ -0,0 +1,1096 @@ +#include +#include + +#include "Linear_64_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/Linear_64.ref.hxx" + +#include "AddBroadcast1_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/AddBroadcast1.ref.hxx" + +#include "LinearWithLeakyRelu_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/LinearWithLeakyRelu.ref.hxx" + +#include "LinearWithSigmoid_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/LinearWithSigmoid.ref.hxx" + +#include "Transpose_FromONNX_GPU_ALPAKA.hxx" + +#include "Concat_0D_FromONNX_GPU_ALPAKA.hxx" +#include "ScatterElements_FromONNX_GPU_ALPAKA.hxx" + +#include "Split_0_FromONNX_GPU_ALPAKA.hxx" +#include "Split_1_FromONNX_GPU_ALPAKA.hxx" +#include "Split_2_FromONNX_GPU_ALPAKA.hxx" + +#include "Tile5D_FromONNX_GPU_ALPAKA.hxx" +#include 
"input_models/references/Tile5D.ref.hxx" + +#include "GatherAxis0_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis1_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis2_FromONNX_GPU_ALPAKA.hxx" +#include "GatherAxis3_FromONNX_GPU_ALPAKA.hxx" +#include "Gather2d_FromONNX_GPU_ALPAKA.hxx" +#include "GatherNegativeIndices_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/GatherAxis0.ref.hxx" +#include "input_models/references/GatherAxis1.ref.hxx" +#include "input_models/references/GatherAxis2.ref.hxx" +#include "input_models/references/GatherAxis3.ref.hxx" +#include "input_models/references/Gather2d.ref.hxx" +#include "input_models/references/GatherNegativeIndices.ref.hxx" + +#include "ExpandSameSize_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ExpandSameSize.ref.hxx" + +#include "ExpandDiffSize_FromONNX_GPU_ALPAKA.hxx" +#include "input_models/references/ExpandDiffSize.ref.hxx" + +#include "GatherND_Ex1_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex2_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex3_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex4_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Ex5_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.hxx" +#include "GatherND_Batch_FromONNX_GPU_ALPAKA.hxx" + +#include +#include +#include +#include "gtest/gtest.h" + +constexpr float DEFAULT_TOLERANCE = 1e-3f; + +using Idx = std::size_t; +using Dim = alpaka::DimInt<1>; +using Ext1D = alpaka::Vec; + +class SofieAlpakaTest : public ::testing::Test { +protected: + // Shared devices and platforms + alpaka::PlatformCpu hostPlatform; + alpaka::DevCpu host; + alpaka::PlatformCudaRt platform; + alpaka::DevCudaRt device; + alpaka::Queue queue; + + SofieAlpakaTest() + : hostPlatform{} + , host(alpaka::getDevByIdx(hostPlatform, 0u)) + , platform{} + , device(alpaka::getDevByIdx(platform, 0u)) + , queue(device) + { + } + + void SetUp() override { + cudaDeviceSynchronize(); + } + + void TearDown() override { + alpaka::wait(queue); + 
cudaDeviceSynchronize(); + } + + ~SofieAlpakaTest() override { + cudaDeviceSynchronize(); + } +}; + + +// TEST_F(SofieAlpakaTest, Linear64) +// { +// constexpr float TOLERANCE = DEFAULT_TOLERANCE; + +// auto A = alpaka::allocBuf(host, Ext1D::all(Idx{1600})); +// float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + +// for (Idx i = 0; i < 1600; ++i) { +// A_ptr[i] = 1.0; +// } + +// auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{1600})); +// alpaka::memcpy(queue, A_d, A); +// alpaka::wait(queue); + +// auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{160})); + +// { +// SOFIE_Linear_64::Session session("Linear_64_FromONNX_GPU_ALPAKA.dat"); +// auto result = session.infer(A_d); +// alpaka::wait(queue); +// cudaDeviceSynchronize(); + +// alpaka::memcpy(queue, result_h, result); +// alpaka::wait(queue); +// } + +// float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); +// float *correct = Linear_64_ExpectedOutput::all_ones; + +// for (size_t i = 0; i < 160; ++i) { +// EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +// } +// } + +TEST_F(SofieAlpakaTest, LinearWithLeakyRelu) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({ + 0.4369, -0.6882, 1.0309, -1.0263, -0.1519, 1.2237, -0.7054, -0.1762, + -0.6811, -2.2597, 1.0388, -0.7993, 0.1468, 1.3257, -0.4714, -0.0958, + 0.7057, -0.3749, -0.3310, 0.0986, -0.1370, 0.0832, -1.6465, -0.2793 + }); + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < input.size(); ++i) { + A_ptr[i] = input[i]; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{24})); + + { + SOFIE_LinearWithLeakyRelu::Session session; + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + 
alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = LinearWithLeakyRelu_ExpectedOutput::outputs; + + for (size_t i = 0; i < 24; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, LinearWithSigmoid) +{ + + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{48})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + for (Idx i = 0; i < 48; ++i) { + A_ptr[i] = 1.0; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{48})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{24})); + + { + SOFIE_LinearWithSigmoid::Session session("LinearWithSigmoid_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(A_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = LinearWithSigmoid_ExpectedOutput::all_ones; + for (size_t i = 0; i < 24; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, AddBroadcast1) +{ + + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + auto A = alpaka::allocBuf(host, Ext1D::all(Idx{5})); + float *A_ptr = reinterpret_cast(alpaka::getPtrNative(A)); + + auto B = alpaka::allocBuf(host, Ext1D::all(Idx{20})); + float *B_ptr = reinterpret_cast(alpaka::getPtrNative(B)); + + std::vector A_vec({-0.78023305, -1.34029483, -3.01482951, 0.53641361, + -1.22594789}); + std::vector B_vec({1.0626695, 0.43842875, 1.22476468, 0.79763274, 0.98688211, + 0.25267614, 0.44874883, 0.31516773, -0.78771195, 0.64565664, + 0.50450593, -0.41265227, -0.22474539, -0.22362374, 0.00509674, + 0.16927211, 1.06756969, -0.81634773, 0.88467744, 0.78902059}); + + for (Idx i = 0; i < A_vec.size(); ++i) { + A_ptr[i] = A_vec[i]; + } + + for (Idx i = 0; i 
< B_vec.size(); ++i) { + B_ptr[i] = B_vec[i]; + } + + auto A_d = alpaka::allocBuf(device, Ext1D::all(Idx{5})); + alpaka::memcpy(queue, A_d, A); + alpaka::wait(queue); + + auto B_d = alpaka::allocBuf(device, Ext1D::all(Idx{20})); + alpaka::memcpy(queue, B_d, B); + alpaka::wait(queue); + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{20})); + + { + SOFIE_AddBroadcast1::Session session; + auto result = session.infer(A_d, B_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float *correct = AddBroadcast1_ExpectedOutput::output; + for (size_t i = 0; i < 20; ++i) { + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Transpose) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // Input shape: (2, 1, 3, 4) -> 24 elements + constexpr Idx inputSize = 24; + // Output shape: (2, 3, 4, 1) -> 24 elements + constexpr Idx outputSize = 24; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + + std::vector input_vec({ + // shape (2, 1, 3, 4) + 0.f, 1.f, 2.f, 3.f, + 4.f, 5.f, 6.f, 7.f, + 8.f, 9.f, 10.f, 11.f, + + 12.f, 13.f, 14.f, 15.f, + 16.f, 17.f, 18.f, 19.f, + 20.f, 21.f, 22.f, 23.f + }); + + for (Idx i = 0; i < inputSize; ++i) + input_ptr[i] = input_vec[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Transpose::Session session; + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + std::vector expected(outputSize); + std::vector inputShape = {2, 1, 3, 4}; + std::vector perm = {0, 2, 3, 1}; + std::vector 
outputShape = {2, 3, 4, 1}; + + std::vector inputStrides = {12, 12, 4, 1}; + std::vector outputStrides = {12, 4, 1, 1}; + + for (size_t i = 0; i < outputSize; ++i) + { + size_t remaining = i; + size_t inputIdx = 0; + for (size_t d = 0; d < 4; ++d) + { + size_t const coord = remaining / outputStrides[d]; + remaining = remaining - coord * outputStrides[d]; + inputIdx += coord * inputStrides[perm[d]]; + } + expected[i] = input_vec[inputIdx]; + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - expected[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Concat0D) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({1.40519865e+00, -2.87660856e-01}); + std::vector expected_output({ + 1.40519865e+00, -2.87660856e-01, + 1.40519865e+00, -2.87660856e-01 + }); + + // Host input buffer + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + // Device input buffer + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // Host output buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected_output.size()})); + + { + SOFIE_Concat_0D::Session session("Concat_0D_FromONNX_GPU_ALPAKA.dat"); + + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + + for (size_t i = 0; i < expected_output.size(); ++i) { + EXPECT_LE(std::abs(res_ptr[i] - expected_output[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, ScatterElements) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input (9, 0.f); + std::vector indices = { 1, 0, 2, 
0, 2, 1 }; + std::vector updates = { 1.f, 1.1f, 1.2f, 2.f, 2.1f, 2.2f }; + std::vector correct = { 2.f, 1.1f, 0.f, 1.f, 0.f, 2.2f, 0.f, 2.1f, 1.2f }; + + // Allocate and fill host buffers + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + auto indices_h = alpaka::allocBuf(host, Ext1D::all(Idx{indices.size()})); + auto updates_h = alpaka::allocBuf(host, Ext1D::all(Idx{updates.size()})); + + float* input_ptr = reinterpret_cast (alpaka::getPtrNative(input_h)); + int64_t* indices_ptr = reinterpret_cast(alpaka::getPtrNative(indices_h)); + float* updates_ptr = reinterpret_cast (alpaka::getPtrNative(updates_h)); + + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + for (Idx i = 0; i < indices.size(); ++i) indices_ptr[i] = indices[i]; + for (Idx i = 0; i < updates.size(); ++i) updates_ptr[i] = updates[i]; + + // Allocate device buffers and copy + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + auto indices_d = alpaka::allocBuf(device, Ext1D::all(Idx{indices.size()})); + auto updates_d = alpaka::allocBuf(device, Ext1D::all(Idx{updates.size()})); + + alpaka::memcpy(queue, input_d, input_h); + alpaka::memcpy(queue, indices_d, indices_h); + alpaka::memcpy(queue, updates_d, updates_h); + alpaka::wait(queue); + + // Host result buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct.size()})); + + { + SOFIE_ScatterElements::Session session; + auto result = session.infer(input_d, indices_d, updates_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + EXPECT_EQ(correct.size(), 9u); + for (size_t i = 0; i < correct.size(); ++i){ + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); + } +} + +TEST_F(SofieAlpakaTest, Split_0) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 0 in 2 tensors {2,2,3} -> {1,2,3} each + std::vector input 
{1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,3.,4.,5.,6.}, {7.,8.,9.,10.,11.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_0::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Split_1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 1 in 2 tensors {2,2,3} -> {2,1,3} each + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,3.,7.,8.,9.}, {4.,5.,6.,10.,11.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result0_h = alpaka::allocBuf(host, 
Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_1::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Split_2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + // split in axis 2 in 2 tensors {2,2,3} -> {2,2,2} and {2,2,1} + std::vector input {1.,2.,3.,4.,5.,6.,7.,8.,9.,10.,11.,12.}; + std::vector> correct_output = { {1.,2.,4.,5.,7.,8.,10.,11.}, {3.,6.,9.,12.} }; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // outputs have different sizes: {2,2,2}=8 and {2,2,1}=4 + auto result0_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[0].size()})); + auto result1_h = alpaka::allocBuf(host, Ext1D::all(Idx{correct_output[1].size()})); + + { + SOFIE_Split_2::Session session; + auto [result0, result1] = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result0_h, result0); + alpaka::memcpy(queue, result1_h, result1); + alpaka::wait(queue); + } + + float* res0_ptr = 
reinterpret_cast(alpaka::getPtrNative(result0_h)); + float* res1_ptr = reinterpret_cast(alpaka::getPtrNative(result1_h)); + + for (size_t j = 0; j < correct_output[0].size(); ++j) + EXPECT_LE(std::abs(res0_ptr[j] - correct_output[0][j]), TOLERANCE); + for (size_t j = 0; j < correct_output[1].size(); ++j) + EXPECT_LE(std::abs(res1_ptr[j] - correct_output[1][j]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Tile5D) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input_data({ + 0.2386120855808258, 0.5549510717391968, -1.8190287351608276, 0.5724563598632812, -0.6596977710723877, + 0.17560836672782898, 0.7608169317245483, 0.08603227883577347, -0.049375515431165695, 0.2705111503601074, + 1.42119562625885, 0.032626643776893616, -1.212586522102356, -0.5129594802856445, -0.43296414613723755, + -0.1606937050819397, 1.1884371042251587, -0.662174642086029, -2.291109323501587, -0.6852569580078125, + 2.325223922729492, -0.19389064610004425, -0.5784135460853577, -0.39328137040138245, 0.2831517457962036, + 0.4496127665042877, -0.2029038816690445, 0.35477763414382935, 0.4266718924045563, 0.24683749675750732, + 1.90426504611969, -0.4861580729484558, 0.9139055013656616, -0.5031066536903381, 0.9583520293235779, + -0.23210509121418, 1.3183971643447876, 1.7042455673217773, -0.3201166093349457, -0.14444805681705475, + -0.8829464912414551, 1.725736141204834, 0.45657631754875183, 0.4920198321342468, -1.088847041130066, + 0.49437597393989563, -0.006085286382585764, 2.475630760192871, 0.12170185893774033, -0.8953945636749268, + 1.1430096626281738, 1.3278610706329346, 0.3076854348182678, 0.036237504333257675, 0.05180325731635094, + 0.2802475392818451, 0.5289335250854492, 0.9356630444526672, 0.7863689064979553, 0.4239695370197296, + 0.8723016977310181, -0.2248474359512329, 0.3891502320766449, 0.5463842153549194, -0.7782878875732422, + -0.8570080399513245, -2.593783378601074, -0.11392943561077118, 0.5637082457542419, 2.075004816055298, + -1.0598397254943848, 1.0823975801467896 
+ }); + + const std::size_t inputSize = input_data.size(); + const std::size_t outputSize = sizeof(Tile5D_ExpectedOutput::output) / sizeof(float); + + // Allocate and fill host input buffer + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < inputSize; ++i) + input_ptr[i] = input_data[i]; + + // Copy to device + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + // Host result buffer + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Tile5D::Session session; + auto result = session.infer(input_d); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Tile5D_ExpectedOutput::output; + + EXPECT_EQ(outputSize, sizeof(Tile5D_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis0) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis0::Session session("GatherAxis0_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + 
alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis0_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis0_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis1::Session session("GatherAxis1_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis1_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis1_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, 
Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis2::Session session("GatherAxis2_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis2_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis2_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherAxis3) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 120; + const std::size_t outputSize = sizeof(GatherAxis3_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherAxis3::Session session("GatherAxis3_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherAxis3_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherAxis3_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, Gather2d) +{ + constexpr float TOLERANCE = 
DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 9; + const std::size_t outputSize = sizeof(Gather2d_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_Gather2d::Session session("Gather2d_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = Gather2d_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(Gather2d_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherNegativeIndices) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + constexpr Idx inputSize = 10; + const std::size_t outputSize = sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{inputSize})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + std::iota(input_ptr, input_ptr + inputSize, 0.f); + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{inputSize})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_GatherNegativeIndices::Session session("GatherNegativeIndices_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); 
+ } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = GatherNegativeIndices_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(GatherNegativeIndices_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, ExpandSameSize) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.f, 1.f, 2.f}); + const std::size_t outputSize = sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] = input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ExpandSameSize::Session session("ExpandSameSize_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ExpandSameSize_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ExpandSameSize_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, ExpandDiffSize) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector input({0.f, 1.f, 2.f}); + const std::size_t outputSize = sizeof(ExpandDiffSize_ExpectedOutput::output) / sizeof(float); + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{input.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < input.size(); ++i) + input_ptr[i] 
= input[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{input.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{outputSize})); + + { + SOFIE_ExpandDiffSize::Session session("ExpandDiffSize_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res_ptr = reinterpret_cast(alpaka::getPtrNative(result_h)); + float* correct = ExpandDiffSize_ExpectedOutput::output; + EXPECT_EQ(outputSize, sizeof(ExpandDiffSize_ExpectedOutput::output) / sizeof(float)); + for (size_t i = 0; i < outputSize; ++i) + EXPECT_LE(std::abs(res_ptr[i] - correct[i]), TOLERANCE); +} + +TEST_F(SofieAlpakaTest, GatherND_Ex1) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f, 1.f, 2.f, 3.f}; + std::vector expected = {0.f, 3.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex1::Session session("GatherND_Ex1_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 2u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex2) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f, 1.f, 2.f, 
3.f}; + std::vector expected = {2.f, 3.f, 0.f, 1.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex2::Session session("GatherND_Ex2_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex3) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex3::Session session("GatherND_Ex3_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - 
expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex4) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex4::Session session("GatherND_Ex4_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Ex5) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f}; + std::vector expected = {2.f, 3.f, 4.f, 5.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_Ex5::Session session("GatherND_Ex5_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } 
+ + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 4u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_NegativeIndices) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data = {0.f,1.f,2.f,3.f,4.f,5.f,6.f,7.f,8.f}; + std::vector expected = {6.f, 2.f, 4.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); + + { + SOFIE_GatherND_NegativeIndices::Session session("GatherND_NegativeIndices_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 3u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} + +TEST_F(SofieAlpakaTest, GatherND_Batch) +{ + constexpr float TOLERANCE = DEFAULT_TOLERANCE; + + std::vector data(24); + std::iota(data.begin(), data.end(), 0.f); + std::vector expected = {4.f,5.f,6.f,7.f, 20.f,21.f,22.f,23.f}; + + auto input_h = alpaka::allocBuf(host, Ext1D::all(Idx{data.size()})); + float* input_ptr = reinterpret_cast(alpaka::getPtrNative(input_h)); + for (Idx i = 0; i < data.size(); ++i) input_ptr[i] = data[i]; + + auto input_d = alpaka::allocBuf(device, Ext1D::all(Idx{data.size()})); + alpaka::memcpy(queue, input_d, input_h); + alpaka::wait(queue); + + auto result_h = alpaka::allocBuf(host, Ext1D::all(Idx{expected.size()})); 
+ + { + SOFIE_GatherND_Batch::Session session("GatherND_Batch_FromONNX_GPU_ALPAKA.dat"); + auto result = session.infer(input_d); + alpaka::wait(queue); + cudaDeviceSynchronize(); + alpaka::memcpy(queue, result_h, result); + alpaka::wait(queue); + } + + float* res = reinterpret_cast(alpaka::getPtrNative(result_h)); + ASSERT_EQ(expected.size(), 8u); + for (size_t i = 0; i < expected.size(); ++i) + EXPECT_LE(std::abs(res[i] - expected[i]), TOLERANCE) << "i=" << i; +} diff --git a/src/SOFIE_core/test/input_models/GNN_model.onnx b/src/SOFIE_core/test/input_models/GNN_model.onnx new file mode 100644 index 0000000..833e34d Binary files /dev/null and b/src/SOFIE_core/test/input_models/GNN_model.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Batch.onnx b/src/SOFIE_core/test/input_models/GatherND_Batch.onnx new file mode 100644 index 0000000..4d146c6 Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Batch.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Ex1.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex1.onnx new file mode 100644 index 0000000..bc1a910 Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex1.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Ex2.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex2.onnx new file mode 100644 index 0000000..4cd511c Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex2.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Ex3.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex3.onnx new file mode 100644 index 0000000..917008f Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex3.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_Ex4.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex4.onnx new file mode 100644 index 0000000..d3006a2 Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex4.onnx differ diff --git 
a/src/SOFIE_core/test/input_models/GatherND_Ex5.onnx b/src/SOFIE_core/test/input_models/GatherND_Ex5.onnx new file mode 100644 index 0000000..be1ba0d Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_Ex5.onnx differ diff --git a/src/SOFIE_core/test/input_models/GatherND_NegativeIndices.onnx b/src/SOFIE_core/test/input_models/GatherND_NegativeIndices.onnx new file mode 100644 index 0000000..5fa05aa Binary files /dev/null and b/src/SOFIE_core/test/input_models/GatherND_NegativeIndices.onnx differ diff --git a/src/SOFIE_core/test/input_models/Transpose.onnx b/src/SOFIE_core/test/input_models/Transpose.onnx new file mode 100644 index 0000000..0e08157 Binary files /dev/null and b/src/SOFIE_core/test/input_models/Transpose.onnx differ diff --git a/src/SOFIE_parsers/CMakeLists.txt b/src/SOFIE_parsers/CMakeLists.txt index 379b7d7..0e7e03d 100644 --- a/src/SOFIE_parsers/CMakeLists.txt +++ b/src/SOFIE_parsers/CMakeLists.txt @@ -61,6 +61,7 @@ set(sources_cxx src/ParseLayerNormalization.cxx src/ParseExpand.cxx src/ParseGather.cxx + src/ParseGatherND.cxx src/ParseElu.cxx src/ParseFuseConvAdd.cxx src/ParseFuseConvTransposeAdd.cxx @@ -102,6 +103,15 @@ target_include_directories(SOFIE_parsers PUBLIC set_target_properties(SOFIE_parsers PROPERTIES POSITION_INDEPENDENT_CODE TRUE) + ROOT_GENERATE_DICTIONARY(G__SOFIE_parsers ${sources_headers} + LINKDEF inc/LinkDef.h + MODULE SOFIE_parsers + OPTIONS --deep +) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers_rdict.pcm + ${CMAKE_CURRENT_BINARY_DIR}/libSOFIE_parsers.rootmap + DESTINATION lib) + install(TARGETS SOFIE_parsers LIBRARY DESTINATION lib ) diff --git a/src/SOFIE_parsers/src/ParseGatherND.cxx b/src/SOFIE_parsers/src/ParseGatherND.cxx new file mode 100644 index 0000000..57beb01 --- /dev/null +++ b/src/SOFIE_parsers/src/ParseGatherND.cxx @@ -0,0 +1,49 @@ +#include "SOFIE/RModelParser_ONNX.hxx" +#include "SOFIE/ROperator_GatherND.hxx" +#include "onnx_proto3.pb.h" +#include + + +namespace SOFIE { + 
/* ParseGatherND: parser hook that registers an ONNX GatherND node with SOFIE. It requires the data tensor's type to be registered (throws otherwise), checks — when the indices tensor's type is already registered — that indices are INT64, reads the optional 'batch_dims' attribute (default 0), and propagates the data tensor's element type to the output tensor. NOTE(review): angle-bracket template arguments were stripped from this paste (e.g. std::unique_ptr<ROperator>, the ROperator_GatherND type parameter) — restore against the original file. */ +ParserFuncSignature ParseGatherND = [](RModelParser_ONNX &parser, const onnx::NodeProto &nodeproto) { + ETensorType input_type = ETensorType::UNDEFINED; + auto input_name = nodeproto.input(0); + if (parser.IsRegisteredTensorType(input_name)) { + input_type = parser.GetTensorType(input_name); + } else { + throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op has input tensor " + input_name + + " but its type is not yet registered"); + } + + /* NOTE(review): if the indices tensor's type is not yet registered, the INT64 validation below is silently skipped — confirm this is intended rather than an error path. */ auto indices_name = nodeproto.input(1); + if (parser.IsRegisteredTensorType(indices_name)) { + ETensorType indices_type = parser.GetTensorType(indices_name); + if (indices_type != ETensorType::INT64) { + throw std::runtime_error("TMVA::SOFIE ONNX Parser GatherND op indices tensor must be INT64, got " + + indices_name); + } + } + + int64_t batch_dims = 0; + for (int i = 0; i < nodeproto.attribute_size(); ++i) { + const auto& attr = nodeproto.attribute(i); + if (attr.name() == "batch_dims") { + batch_dims = attr.i(); + break; + } + } + + std::string output_name = nodeproto.output(0); + + /* NOTE(review): prefer std::make_unique over raw new here, matching modern usage elsewhere in the parsers. */ std::unique_ptr op( + new ROperator_GatherND(batch_dims, input_name, indices_name, output_name)); + + if (!parser.IsRegisteredTensorType(output_name)) { + parser.RegisterTensorType(output_name, input_type); + } + + return op; +}; + +} // namespace SOFIE diff --git a/src/SOFIE_parsers/src/ParseTile.cxx b/src/SOFIE_parsers/src/ParseTile.cxx index 20dbfb6..8b8c47f 100644 --- a/src/SOFIE_parsers/src/ParseTile.cxx +++ b/src/SOFIE_parsers/src/ParseTile.cxx @@ -29,6 +29,7 @@ ParserFuncSignature ParseTile = [](RModelParser_ONNX &parser, const onnx::NodePr switch (input_type) { case ETensorType::FLOAT: op.reset(new ROperator_Tile(repeat_name, input_name, output_name)); break; + /* NOTE(review): new INT64 case for Tile — the ROperator_Tile template argument (presumably <int64_t>) was stripped in this paste; verify against the original file. */ case ETensorType::INT64: op.reset(new ROperator_Tile(repeat_name, input_name, output_name)); break; + default: throw std::runtime_error("TMVA::SOFIE - Unsupported - Operator Tile does not yet support input type " + std::to_string(static_cast(input_type))); diff --git 
a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx b/src/SOFIE_parsers/src/RModelParser_ONNX.cxx index 68662ae..5924836 100644 --- a/src/SOFIE_parsers/src/RModelParser_ONNX.cxx +++ b/src/SOFIE_parsers/src/RModelParser_ONNX.cxx @@ -73,6 +73,7 @@ extern ParserFuncSignature ParseShape; extern ParserFuncSignature ParseMatMul; extern ParserFuncSignature ParseLayerNormalization; extern ParserFuncSignature ParseGather; +extern ParserFuncSignature ParseGatherND; extern ParserFuncSignature ParseErf; extern ParserFuncSignature ParseElu; extern ParserFuncSignature ParseEyeLike; @@ -134,6 +135,7 @@ struct ExtractDataFromTP { }; template std::shared_ptr GetInitializedTensorData(onnx::TensorProto * tensorproto, size_t length) { + /* FIXME(review): leftover debug print committed to the parser — it fires unconditionally for every initialized tensor. Remove it, or gate it behind the parser's existing 'verbose' flag like the other diagnostics in this file. */ std::cout<<"Getting Initialized Tensor data for tensor " << tensorproto->name() << " of type " << tensorproto->data_type() << " and length " << length << std::endl; std::shared_ptr data(malloc(length * sizeof(T)), free); if (!tensorproto->raw_data().empty()) { @@ -217,6 +219,7 @@ RModelParser_ONNX::RModelParser_ONNX() noexcept : fOperatorsMapImpl(std::make_un RegisterOperator("LayerNormalization", ParseLayerNormalization); RegisterOperator("Expand", ParseExpand); RegisterOperator("Gather", ParseGather); + RegisterOperator("GatherND", ParseGatherND); RegisterOperator("Erf", ParseErf); RegisterOperator("Elu", ParseElu); RegisterOperator("EyeLike", ParseEyeLike); @@ -584,6 +587,13 @@ void RModelParser_ONNX::ParseONNXGraph(RModel & rmodel, const onnx::GraphProto & if (verbose) std::cout << "add INT64 initialized tensor " << input_name << " shape " << ConvertShapeToString(shape) << std::endl; rmodel.AddInitializedTensor(input_name, ETensorType::INT64, shape, data); allInitializedTensors[input_name] = i; + /* FIXME(review): leftover debug dump — remove or gate behind 'verbose'. Additionally, the loop below reads 'fLength', which does not appear to be a local of this scope (presumably it should be the tensor's element count) — confirm against the original file; the reinterpret_cast/static_cast template argument here was also stripped in this paste. */ std::cout<<"Printing initialized values for tensor: "<(data.get()); + + for (size_t i = 0; i < fLength; ++i) { + std::cout << rawData[i] << " "; + } + std::cout << std::endl; break; } default: diff --git a/src/utils/CMakeLists.txt b/src/utils/CMakeLists.txt new 
file mode 100644 index 0000000..2ede060 --- /dev/null +++ b/src/utils/CMakeLists.txt @@ -0,0 +1,11 @@ +add_library(utils INTERFACE) + +target_include_directories(utils INTERFACE + $ + $ +) + +install( + DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/SOFIE + DESTINATION include +) diff --git a/src/utils/SOFIE/RTensor.hxx b/src/utils/SOFIE/RTensor.hxx new file mode 100644 index 0000000..db82dc9 --- /dev/null +++ b/src/utils/SOFIE/RTensor.hxx @@ -0,0 +1,628 @@ +#ifndef SOFIE_RTENSOR +#define SOFIE_RTENSOR + +#include +#include // std::size_t +#include +#include // std::runtime_error +#include // std::stringstream +#include // std::shared_ptr +#include // std::is_convertible +#include // std::reverse +#include // std::random_access_iterator_tag + +namespace SOFIE { + +/// Memory layout type +enum class MemoryLayout : uint8_t { + RowMajor = 0x01, + ColumnMajor = 0x02 +}; + +namespace Internal { + +/// \brief Get size of tensor from shape vector +/// \param[in] shape Shape vector +/// \return Size of contiguous memory +template +inline std::size_t GetSizeFromShape(const T &shape) +{ + if (shape.size() == 0) + return 0; + std::size_t size = 1; + for (auto &s : shape) + size *= s; + return size; +} + +/// \brief Compute strides from shape vector. +/// \param[in] shape Shape vector +/// \param[in] layout Memory layout +/// \return Size of contiguous memory +/// +/// This information is needed for the multi-dimensional indexing. 
See here: +/// https://en.wikipedia.org/wiki/Row-_and_column-major_order +/// https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.strides.html +template +inline std::vector ComputeStridesFromShape(const T &shape, MemoryLayout layout) +{ + const auto size = shape.size(); + T strides(size); + if (layout == MemoryLayout::RowMajor) { + for (std::size_t i = 0; i < size; i++) { + if (i == 0) { + strides[size - 1 - i] = 1; + } else { + strides[size - 1 - i] = strides[size - 1 - i + 1] * shape[size - 1 - i + 1]; + } + } + } else if (layout == MemoryLayout::ColumnMajor) { + for (std::size_t i = 0; i < size; i++) { + if (i == 0) { + strides[i] = 1; + } else { + strides[i] = strides[i - 1] * shape[i - 1]; + } + } + } else { + std::stringstream ss; + ss << "Memory layout type is not valid for calculating strides."; + throw std::runtime_error(ss.str()); + } + return strides; +} + +/// \brief Compute indices from global index +/// \param[in] shape Shape vector +/// \param[in] idx Global index +/// \param[in] layout Memory layout +/// \return Indice vector +template +inline T ComputeIndicesFromGlobalIndex(const T& shape, MemoryLayout layout, const typename T::value_type idx) +{ + const auto size = shape.size(); + auto strides = ComputeStridesFromShape(shape, layout); + T indices(size); + auto r = idx; + for (std::size_t i = 0; i < size; i++) { + indices[i] = int(r / strides[i]); + r = r % strides[i]; + } + return indices; +} + +/// \brief Compute global index from indices +/// \param[in] strides Strides vector +/// \param[in] idx Indice vector +/// \return Global index +template +inline std::size_t ComputeGlobalIndex(const U& strides, const V& idx) +{ + std::size_t globalIndex = 0; + const auto size = idx.size(); + for (std::size_t i = 0; i < size; i++) { + globalIndex += strides[size - 1 - i] * idx[size - 1 - i]; + } + return globalIndex; +} + +/// \brief Type checking for all types of a parameter pack, e.g., used in combination with std::is_convertible +template 
+struct and_types : std::true_type { +}; + +template +struct and_types : std::integral_constant()> { +}; + +/// \brief Copy slice of a tensor recursively from here to there +/// \param[in] here Source tensor +/// \param[in] there Target tensor (slice of source tensor) +/// \param[in] mins Minimum of indices for each dimension +/// \param[in] maxs Maximum of indices for each dimension +/// \param[in] idx Current indices +/// \param[in] active Active index needed to stop the recursion +/// +/// Copy the content of a slice of a tensor from source to target. This is done +/// by recursively iterating over the ranges of the slice for each dimension. +template +void RecursiveCopy(const T &here, T &there, + const std::vector &mins, const std::vector &maxs, + std::vector idx, std::size_t active) +{ + const auto size = idx.size(); + for (std::size_t i = mins[active]; i < maxs[active]; i++) { + idx[active] = i; + if (active == size - 1) { + auto idxThere = idx; + for (std::size_t j = 0; j < size; j++) { + idxThere[j] -= mins[j]; + } + there(idxThere) = here(idx); + } else { + Internal::RecursiveCopy(here, there, mins, maxs, idx, active + 1); + } + } +} + +} // namespace SOFIE::Internal + +/// \class SOFIE::RTensor +/// \brief RTensor is a container with contiguous memory and shape information. +/// \tparam T Data-type of the tensor +/// +/// An RTensor is a vector-like container, which has additional shape information. +/// The elements of the multi-dimensional container can be accessed by their +/// indices in a coherent way without taking care about the one-dimensional memory +/// layout of the contiguous storage. This also allows to manipulate the shape +/// of the container without moving the actual elements in memory. Another feature +/// is that an RTensor can own the underlying contiguous memory but can also represent +/// only a view on existing data without owning it. 
+template > +class RTensor { +public: + // Typedefs + using Value_t = V; + using Shape_t = std::vector; + using Index_t = Shape_t; + using Slice_t = std::vector; + using Container_t = C; + +private: + Shape_t fShape; + Shape_t fStrides; + std::size_t fSize; + MemoryLayout fLayout; + Value_t *fData; + std::shared_ptr fContainer; + +protected: + void ReshapeInplace(const Shape_t &shape); + +public: + // Constructors + + /// \brief Construct a tensor as view on data + /// \param[in] data Pointer to data contiguous in memory + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + RTensor(Value_t *data, Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout), fData(data), fContainer(nullptr) + { + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + } + + /// \brief Construct a tensor as view on data + /// \param[in] data Pointer to data contiguous in memory + /// \param[in] shape Shape vector + /// \param[in] strides Strides vector + /// \param[in] layout Memory layout + RTensor(Value_t *data, Shape_t shape, Shape_t strides, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fStrides(strides), fLayout(layout), fData(data), fContainer(nullptr) + { + fSize = Internal::GetSizeFromShape(shape); + } + + /// \brief Construct a tensor owning externally provided data + /// \param[in] container Shared pointer to data container + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + RTensor(std::shared_ptr container, Shape_t shape, + MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout), fContainer(container) + { + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + fData = std::data(*fContainer); + } + + /// \brief Construct a tensor owning data initialized with new container + /// \param[in] shape Shape vector + /// \param[in] layout Memory layout + 
RTensor(Shape_t shape, MemoryLayout layout = MemoryLayout::RowMajor) + : fShape(shape), fLayout(layout) + { + // TODO: Document how data pointer is determined using STL iterator interface. + // TODO: Sanitize given container type with type traits + fSize = Internal::GetSizeFromShape(shape); + fStrides = Internal::ComputeStridesFromShape(shape, layout); + fContainer = std::make_shared(fSize); + fData = std::data(*fContainer); + } + + // Access elements + Value_t &operator()(const Index_t &idx); + const Value_t &operator() (const Index_t &idx) const; + template Value_t &operator()(Idx... idx); + template const Value_t &operator() (Idx... idx) const; + + // Access properties + std::size_t GetSize() const { return fSize; } + const Shape_t &GetShape() const { return fShape; } + const Shape_t &GetStrides() const { return fStrides; } + Value_t *GetData() { return fData; } + const Value_t *GetData() const { return fData; } + std::shared_ptr GetContainer() { return fContainer; } + const std::shared_ptr GetContainer() const { return fContainer; } + MemoryLayout GetMemoryLayout() const { return fLayout; } + bool IsView() const { return fContainer == nullptr; } + bool IsOwner() const { return !IsView(); } + + // Copy + RTensor Copy(MemoryLayout layout = MemoryLayout::RowMajor) const; + + // Transformations + RTensor Transpose() const; + RTensor Squeeze() const; + RTensor ExpandDims(int idx) const; + RTensor Reshape(const Shape_t &shape) const; + RTensor Resize(const Shape_t &shape); + RTensor Slice(const Slice_t &slice); + + // Iterator class + class Iterator { + private: + RTensor& fTensor; + Index_t::value_type fGlobalIndex; + public: + using iterator_category = std::random_access_iterator_tag; + using value_type = Value_t; + using difference_type = std::ptrdiff_t; + using pointer = Value_t *; + using reference = Value_t &; + + Iterator(RTensor& x, typename Index_t::value_type idx) : fTensor(x), fGlobalIndex(idx) {} + Iterator& operator++() { fGlobalIndex++; return *this; } 
+ Iterator operator++(int) { auto tmp = *this; operator++(); return tmp; } + Iterator& operator--() { fGlobalIndex--; return *this; } + Iterator operator--(int) { auto tmp = *this; operator--(); return tmp; } + Iterator operator+(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex + rhs); } + Iterator operator-(difference_type rhs) const { return Iterator(fTensor, fGlobalIndex - rhs); } + difference_type operator-(const Iterator& rhs) { return fGlobalIndex - rhs.GetGlobalIndex(); } + Iterator& operator+=(difference_type rhs) { fGlobalIndex += rhs; return *this; } + Iterator& operator-=(difference_type rhs) { fGlobalIndex -= rhs; return *this; } + Value_t& operator*() + { + auto idx = Internal::ComputeIndicesFromGlobalIndex(fTensor.GetShape(), fTensor.GetMemoryLayout(), fGlobalIndex); + return fTensor(idx); + } + bool operator==(const Iterator& rhs) const + { + if (fGlobalIndex == rhs.GetGlobalIndex()) return true; + return false; + } + bool operator!=(const Iterator& rhs) const { return !operator==(rhs); }; + bool operator>(const Iterator& rhs) const { return fGlobalIndex > rhs.GetGlobalIndex(); } + bool operator<(const Iterator& rhs) const { return fGlobalIndex < rhs.GetGlobalIndex(); } + bool operator>=(const Iterator& rhs) const { return fGlobalIndex >= rhs.GetGlobalIndex(); } + bool operator<=(const Iterator& rhs) const { return fGlobalIndex <= rhs.GetGlobalIndex(); } + typename Index_t::value_type GetGlobalIndex() const { return fGlobalIndex; }; + }; + + // Iterator interface + // TODO: Document that the iterator always iterates following the physical memory layout. 
+ Iterator begin() noexcept { + return Iterator(*this, 0); + } + Iterator end() noexcept { + return Iterator(*this, fSize); + } +}; + +/// \brief Reshape tensor in place +/// \param[in] shape Shape vector +/// Reshape tensor without changing the overall size +template +inline void RTensor::ReshapeInplace(const Shape_t &shape) +{ + const auto size = Internal::GetSizeFromShape(shape); + if (size != fSize) { + std::stringstream ss; + ss << "Cannot reshape tensor with size " << fSize << " into shape { "; + for (std::size_t i = 0; i < shape.size(); i++) { + if (i != shape.size() - 1) { + ss << shape[i] << ", "; + } else { + ss << shape[i] << " }."; + } + } + throw std::runtime_error(ss.str()); + } + + // Compute new strides from shape + auto strides = Internal::ComputeStridesFromShape(shape, fLayout); + fShape = shape; + fStrides = strides; +} + + +/// \brief Access elements +/// \param[in] idx Index vector +/// \return Reference to element +template +inline Value_t &RTensor::operator()(const Index_t &idx) +{ + const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx); + return fData[globalIndex]; +} + +/// \brief Access elements +/// \param[in] idx Index vector +/// \return Reference to element +template +inline const Value_t &RTensor::operator() (const Index_t &idx) const +{ + const auto globalIndex = Internal::ComputeGlobalIndex(fStrides, idx); + return fData[globalIndex]; +} + +/// \brief Access elements +/// \param[in] idx Indices +/// \return Reference to element +template +template +Value_t &RTensor::operator()(Idx... idx) +{ + static_assert(Internal::and_types...>{}, + "Indices are not convertible to std::size_t."); + return operator()({static_cast(idx)...}); +} + +/// \brief Access elements +/// \param[in] idx Indices +/// \return Reference to element +template +template +const Value_t &RTensor::operator() (Idx... 
idx) const +{ + static_assert(Internal::and_types...>{}, + "Indices are not convertible to std::size_t."); + return operator()({static_cast(idx)...}); +} + +/// \brief Transpose +/// \returns New RTensor +/// The tensor is transposed by inverting the associated memory layout from row- +/// major to column-major and vice versa. Therefore, the underlying data is not +/// touched. +template +inline RTensor RTensor::Transpose() const +{ + MemoryLayout layout; + // Transpose by inverting memory layout + if (fLayout == MemoryLayout::RowMajor) { + layout = MemoryLayout::ColumnMajor; + } else if (fLayout == MemoryLayout::ColumnMajor) { + layout = MemoryLayout::RowMajor; + } else { + throw std::runtime_error("Memory layout is not known."); + } + + // Create copy of container + RTensor x(fData, fShape, fStrides, layout); + + // Reverse shape + std::reverse(x.fShape.begin(), x.fShape.end()); + + // Reverse strides + std::reverse(x.fStrides.begin(), x.fStrides.end()); + + return x; +} + +/// \brief Squeeze dimensions +/// \returns New RTensor +/// Squeeze removes the dimensions of size one from the shape. +template +inline RTensor RTensor::Squeeze() const +{ + // Remove dimensions of one and associated strides + Shape_t shape; + Shape_t strides; + for (std::size_t i = 0; i < fShape.size(); i++) { + if (fShape[i] != 1) { + shape.emplace_back(fShape[i]); + strides.emplace_back(fStrides[i]); + } + } + + // If all dimensions are 1, we need to keep one. + // This does not apply if the inital shape is already empty. Then, return + // the empty shape. + if (shape.size() == 0 && fShape.size() != 0) { + shape.emplace_back(1); + strides.emplace_back(1); + } + + // Create copy, attach new shape and strides and return + RTensor x(*this); + x.fShape = shape; + x.fStrides = strides; + return x; +} + +/// \brief Expand dimensions +/// \param[in] idx Index in shape vector where dimension is added +/// \returns New RTensor +/// Inserts a dimension of one into the shape. 
+template +inline RTensor RTensor::ExpandDims(int idx) const +{ + // Compose shape vector with additional dimensions and adjust strides + const int len = fShape.size(); + auto shape = fShape; + auto strides = fStrides; + if (idx < 0) { + idx = len + 1 + idx; + } + if (idx < 0) { + throw std::runtime_error("Given negative index is invalid."); + } + else if (idx > len) { + throw std::runtime_error("Given index is invalid."); + } + shape.insert(shape.begin() + idx, 1); + strides = Internal::ComputeStridesFromShape(shape, fLayout); + + // Create view copy, attach new shape and strides and return + RTensor x(*this); + x.fShape = shape; + x.fStrides = strides; + return x; +} + +/// \brief Reshape tensor +/// \param[in] shape Shape vector +/// \returns New RTensor +/// Reshape tensor without changing the overall size +template +inline RTensor RTensor::Reshape(const Shape_t &shape) const +{ + // Create copy, replace and return + RTensor x(*this); + x.ReshapeInplace(shape); + return x; +} + +/// \brief Resize tensor +/// \param[in] shape Shape vector +/// \returns New RTensor +/// Resize tensor into new shape +template +inline RTensor RTensor::Resize(const Shape_t &shape) +{ + // Create new tensor with the specified shape + RTensor x(shape, fLayout); + + // Copying contents from previous tensor + size_t n = (x.GetSize()>fSize) ? fSize : x.GetSize(); + std::copy(this->GetData(), this->GetData() + n, x.GetData() ); + + return x; +} + +/// \brief Create a slice of the tensor +/// \param[in] slice Slice vector +/// \returns New RTensor +/// A slice is a subset of the tensor defined by a vector of pairs of indices. 
+template +inline RTensor RTensor::Slice(const Slice_t &slice) +{ + // Sanitize size of slice + const auto sliceSize = slice.size(); + const auto shapeSize = fShape.size(); + if (sliceSize != shapeSize) { + std::stringstream ss; + ss << "Size of slice (" << sliceSize << ") is unequal number of dimensions (" << shapeSize << ")."; + throw std::runtime_error(ss.str()); + } + + // Sanitize slice indices + // TODO: Sanitize slice indices + /* + for (std::size_t i = 0; i < sliceSize; i++) { + } + */ + + // Convert -1 in slice to proper pair of indices + // TODO + + // Recompute shape and size + Shape_t shape(sliceSize); + for (std::size_t i = 0; i < sliceSize; i++) { + shape[i] = slice[i][1] - slice[i][0]; + } + auto size = Internal::GetSizeFromShape(shape); + + // Determine first element contributing to the slice and get the data pointer + Value_t *data; + Shape_t idx(sliceSize); + for (std::size_t i = 0; i < sliceSize; i++) { + idx[i] = slice[i][0]; + } + data = &operator()(idx); + + // Create copy and modify properties + RTensor x(*this); + x.fData = data; + x.fShape = shape; + x.fSize = size; + + // Squeeze tensor and return + return x.Squeeze(); +} + +/// Copy RTensor to new object +/// \param[in] layout Memory layout of the new RTensor +/// \returns New RTensor +/// The operation copies all elements of the current RTensor to a new RTensor +/// with the given layout contiguous in memory. Note that this copies by default +/// to a row major memory layout. 
+template +inline RTensor RTensor::Copy(MemoryLayout layout) const +{ + // Create new tensor with zeros owning the memory + RTensor r(fShape, layout); + + // Copy over the elements from this tensor + const auto mins = Shape_t(fShape.size()); + const auto maxs = fShape; + auto idx = mins; + Internal::RecursiveCopy(*this, r, mins, maxs, idx, 0); + + return r; +} + +/// \brief Pretty printing +/// \param[in] os Output stream +/// \param[in] x RTensor +/// \return Modified output stream +template +std::ostream &operator<<(std::ostream &os, RTensor &x) +{ + const auto shapeSize = x.GetShape().size(); + if (shapeSize == 1) { + os << "{ "; + const auto size = x.GetSize(); + for (std::size_t i = 0; i < size; i++) { + os << x({i}); + if (i != size - 1) + os << ", "; + } + os << " }"; + } else if (shapeSize == 2) { + os << "{"; + const auto shape = x.GetShape(); + for (std::size_t i = 0; i < shape[0]; i++) { + os << " { "; + for (std::size_t j = 0; j < shape[1]; j++) { + os << x({i, j}); + if (j < shape[1] - 1) { + os << ", "; + } else { + os << " "; + } + } + os << "}"; + } + os << " }"; + } else { + os << "{ printing not yet implemented for this rank }"; + } + return os; +} + +} // namespace SOFIE + +namespace cling { +template +std::string printValue(SOFIE::RTensor *x) +{ + std::stringstream ss; + ss << *x; + return ss.str(); +} +} // namespace cling + +#endif // SOFIE_RTENSOR