diff --git a/.github/workflows/BuildDocker.yml b/.github/workflows/BuildDocker.yml
index 01dd15ef32..6e0ac03a3c 100644
--- a/.github/workflows/BuildDocker.yml
+++ b/.github/workflows/BuildDocker.yml
@@ -38,4 +38,4 @@ jobs:
           file: Container/Dockerfile
           push: true
           # JUNGVI: If you operate from a fork and want to build a new docker make sure to replace 'pulp-platform' by your uname.
-          tags: ghcr.io/pulp-platform/deeploy:main
+          tags: ghcr.io/runwangdl/deeploy:redmule
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 7d355c822b..6ed9866c5f 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -9,7 +9,7 @@ on:
     - cron: "0 1 */6 * *"
 
 env:
-  DOCKER_IMAGE: ghcr.io/pulp-platform/deeploy:main
+  DOCKER_IMAGE: ghcr.io/runwangdl/deeploy:redmule
 
 jobs:
 
@@ -338,7 +338,7 @@ jobs:
           },
           {
             "name": "testFloat2DConvolution",
-            "L1": [2000]
+            "L1": [8000]
           },
           {
             "name": "testFloatLayerNorm",
@@ -420,7 +420,7 @@ jobs:
           },
           {
             "name": "testFloat2DConvolution",
-            "L1": [4000]
+            "L1": [15000]
           },
           {
             "name": "testFloatLayerNorm",
@@ -514,12 +514,8 @@ jobs:
             L1: [64000]
           - name: "CCT/CCT_1_16_16_64"
             L1: [64000]
-          - name: "CCT/CCT_1_16_16_128"
-            L1: [64000]
           - name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_64"
             L1: [64000]
-          - name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_128"
-            L1: [64000]
         num-cores:
           - 8
         default-memory-level:
@@ -559,12 +555,8 @@ jobs:
             L1: [64000]
           - name: "CCT/CCT_1_16_16_64"
             L1: [64000]
-          - name: "CCT/CCT_1_16_16_128"
-            L1: [64000]
           - name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_64"
             L1: [64000]
-          - name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_128"
-            L1: [64000]
         num-cores:
           - 8
         double-buffer:
@@ -748,6 +740,42 @@ jobs:
       default-memory-level: ${{ matrix.default-memory-level }}
       neureka-wmem: ${{ matrix.neureka-wmem }}
 
+  siracusa-redmule-kernels-tiled-singlebuffer-L2:
+    strategy:
+      fail-fast: false
+      matrix:
+        test-data:
+          - name: "testFloatMatmul"
+            L1: [8000]
+        num-cores:
+          - 8
+    uses: ./.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
+    needs: select-docker-image
+    with:
+      docker-image: ${{ needs.select-docker-image.outputs.image }}
+      test-name: ${{ matrix.test-data.name }}
+      num-cores: ${{ matrix.num-cores }}
+      L1: ${{ toJson(matrix.test-data.L1) }}
+
+  siracusa-redmule-kernels-tiled-doublebuffer-L2:
+    strategy:
+      fail-fast: false
+      matrix:
+        test-data:
+          - name: "testFloatMatmul"
+            L1: [8000]
+        num-cores:
+          - 8
+        double-buffer:
+          - true
+    uses: ./.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
+    needs: select-docker-image
+    with:
+      docker-image: ${{ needs.select-docker-image.outputs.image }}
+      test-name: ${{ matrix.test-data.name }}
+      num-cores: ${{ matrix.num-cores }}
+      L1: ${{ toJson(matrix.test-data.L1) }}
+      double-buffer: ${{ matrix.double-buffer }}
 
 ### Deeploy Extension and Internal Tests ###
 
   deeploy-memory-allocation:
diff --git a/.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml b/.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
new file mode 100644
index 0000000000..e12c4b675c
--- /dev/null
+++ b/.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
@@ -0,0 +1,72 @@
+name: TestRunnerTiledSiracusaWithRedmule
+
+on:
+  workflow_call:
+    inputs:
+      docker-image:
+        required: true
+        type: string
+      test-name:
+        required: true
+        type: string
+      num-cores:
+        required: false
+        default: 8
+        type: number
+      L1:
+        required: false
+        default: "[64000]"
+        type: string
+      default-memory-level:
+        required: false
+        default: "L2"
+        type: string
+      double-buffer:
+        required: false
+        default: false
+        type: boolean
+      memory-allocation-strategy:
+        required: false
+        default: "MiniMalloc"
+        type: string
+      search-strategy:
+        required: false
+        default: "random-max"
+        type: string
+
+jobs:
+
+  test-runner-siracusa-tiled:
+    strategy:
+      fail-fast: false
+      matrix:
+        L1: ${{ fromJSON(inputs.L1) }}
+    runs-on: ubuntu-22.04
+    container:
+      image: ${{ inputs.docker-image }}
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Build Deeploy
+        run: pip install -e .
+      - name: Cache ccache
+        id: ccache-cache
+        uses: actions/cache@v4
+        with:
+          path: /app/.ccache
+          key: ${{ runner.os }}-ccache
+      - name: Run Test
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 15
+          max_attempts: 3
+          retry_on: timeout
+          command: |
+            cd DeeployTest
+            mkdir -p /app/.ccache
+            export CCACHE_DIR=/app/.ccache
+            python testRunner_tiled_siracusa_w_redmule.py -t Tests/${{ inputs.test-name }} --cores=${{ inputs.num-cores }} --l1 ${{ matrix.L1 }} --defaultMemLevel=${{ inputs.default-memory-level }} ${{ inputs.double-buffer && '--doublebuffer' || '' }} --memAllocStrategy=${{ inputs.memory-allocation-strategy }} --searchStrategy=${{ inputs.search-strategy }}
+          shell: bash
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f14ed74461..6f8381d29b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -282,4 +282,18 @@ Change main.c to use OUTPUTTYPE instead of float
 ### Changed
 - The ISA for the Siracusa platform has been updated from rv32imc_zfinx_xpulpv2 to rv32imf_xpulpv2.
-- All floating-point comparison tasks in deeploytest.c are now offloaded to Cluster 0 for execution.
\ No newline at end of file
+- All floating-point comparison tasks in deeploytest.c are now offloaded to Cluster 0 for execution.
+
+## Add RV32IMF Picolibc support for Siracusa platform
+
+### Added
+- Adds RV32IMF Picolibc to the toolchain
+
+## Parallelization and Optimization of CCT Inference and Training Kernels
+
+### Added
+- Parallel Matmul, Softmax, Gelu, Conv, Layernorm, Maxpool, Add
+- Gelu with sigmoid approximation
+- Im2col Conv
+- Matmul with 1x7 unrolling, performance aligned with pulptrainlib
+- Compute op support for multiple float kernels: Maxpool, Relu, Mul
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b23293dd55..dbdfb86409 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,8 +15,8 @@ if(TOOLCHAIN STREQUAL GCC)
   set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif()
 
-set(platform MemPool CACHE STRING "Platform (MemPool, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch)")
-set_property(CACHE platform PROPERTY STRINGS MemPool QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch)
+set(platform MemPool CACHE STRING "Platform (MemPool, QEMU, Siracusa, Siracusa_w_neureka, Siracusa_w_redmule, PULP-Open, Generic, Snitch)")
+set_property(CACHE platform PROPERTY STRINGS MemPool QEMU Siracusa Siracusa_w_neureka Siracusa_w_redmule PULP-Open Generic Snitch)
 
 if(platform STREQUAL MemPool)
   message(STATUS "Building for platform 'MemPool'")
@@ -26,6 +26,8 @@ elseif(platform STREQUAL Siracusa)
   message(STATUS "Building for platform 'Siracusa'")
 elseif(platform STREQUAL Siracusa_w_neureka)
   message(STATUS "Building for platform 'Siracusa_w_neureka'")
+elseif(platform STREQUAL Siracusa_w_redmule)
+  message(STATUS "Building for platform 'Siracusa_w_redmule'")
 elseif(platform STREQUAL PULPOpen)
   message(STATUS "Building for platform 'PULP-Open'")
 elseif(platform STREQUAL Generic)
@@ -148,7 +150,7 @@ if(platform STREQUAL QEMU-ARM)
 endif()
 
-if(platform STREQUAL Siracusa OR 
platform STREQUAL Siracusa_w_neureka OR platform STREQUAL PULPOpen) +if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule OR platform STREQUAL PULPOpen) if(TOOLCHAIN STREQUAL LLVM) set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/toolchain_llvm.cmake) @@ -158,7 +160,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp.cmake) - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/siracusa/siracusa.cmake) elseif(platform STREQUAL PULPOpen) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp-open/pulp-open.cmake) diff --git a/Container/Dockerfile b/Container/Dockerfile index ce77db92ad..2d0a78c78f 100644 --- a/Container/Dockerfile +++ b/Container/Dockerfile @@ -42,7 +42,9 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y git-lfs \ libsdl2-ttf-dev \ gcc-multilib \ wget \ - clang-format + clang-format \ + libxtensor-dev \ + libxsimd-dev # Install cmake 3.31.1 RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.1/cmake-3.31.1-linux-x86_64.sh && \ diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index b7249c5e83..5fe3c389bd 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -69,15 +69,16 @@ def __init__(self, maps: List[NodeMapper]): super().__init__(maps) def computeOps(self): - compAbs = self.mapper.parser.operatorRepresentation['size'] - compAdd = self.mapper.parser.operatorRepresentation['size'] - compSqr = self.mapper.parser.operatorRepresentation['size'] - compMul = self.mapper.parser.operatorRepresentation['size'] - compAdd = self.mapper.parser.operatorRepresentation['size'] - compMul2 = self.mapper.parser.operatorRepresentation['size'] - compAdd2 = self.mapper.parser.operatorRepresentation['size'] - compDiv = self.mapper.parser.operatorRepresentation['size'] - return compAbs + compAdd + compSqr + compMul + compAdd + compMul2 + compAdd2 + compDiv + size = self.mapper.parser.operatorRepresentation['size'] + # RW: Sigmoid approximation + mul1 = size # Multiply by 1.702 + neg = size # Negate the result + exp = size # Compute exponential + add = size # Add 1 + div = size # Division for sigmoid + mul2 = size # Final multiplication by x + + return mul1 + neg + exp + add + div + mul2 class iHardswishLayer(ONNXLayer): @@ -120,12 +121,39 @@ class SoftmaxLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + def computeOps(self): + + size = self.mapper.parser.operatorRepresentation['size'] + last_dim_length = self.mapper.parser.operatorRepresentation['lastDimLength'] + batch_size = size // last_dim_length + + max_ops = last_dim_length - 1 + exp_ops = last_dim_length * 2 + sum_ops = last_dim_length - 1 + div_ops = last_dim_length + ops_per_batch = max_ops + exp_ops + sum_ops + div_ops + total_ops = ops_per_batch * batch_size + + return total_ops + class SoftmaxGradLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + def computeOps(self): + input_size = self.mapper.parser.operatorRepresentation['size'] + + # SoftmaxGrad operation: dy * (y - (y * sum(dy * y))) + mul_ops = input_size + sum_ops = input_size + broadcast_mul_ops = input_size + sub_ops = input_size + final_mul_ops = input_size + + return mul_ops + sum_ops + 
broadcast_mul_ops + sub_ops + final_mul_ops + class ITAMaxLayer(ONNXLayer): @@ -252,7 +280,7 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese N = inputShapes[1][-1] if len(inputShapes) == 3: - inputShapes[2] = [M, N] + inputShapes[2] = outputShapes[0] return (inputShapes, outputShapes) @@ -317,6 +345,9 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese inputShapes[0] = inputShapes[1] return (inputShapes, outputShapes) + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + class ConvLayer(ONNXLayer): @@ -374,6 +405,14 @@ class MaxPoolLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + def computeOps(self): + kernel_shape = self.mapper.parser.operatorRepresentation['kernel_shape'] + elements_per_window = int(np.prod(kernel_shape)) + data_out_size = self.mapper.parser.operatorRepresentation['data_out_size'] + comparisons_per_window = elements_per_window - 1 + total_ops = data_out_size * comparisons_per_window + return total_ops + class ReduceMeanLayer(ONNXLayer): @@ -403,6 +442,9 @@ class ReluLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + class LayerNormLayer(ONNXLayer): diff --git a/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py index 711436b7a1..7b011d76d5 100644 --- a/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py @@ -1,12 +1,12 @@ # ---------------------------------------------------------------------- # -# File: iGELUTemplate.py +# File: FloatGELUTemplate.py # -# Last edited: 13.12.2021 +# Last edited: 28.03.2025 # # Copyright (C) 2021, ETH Zurich and University of Bologna. 
# -# Author: Moritz Scherer, ETH Zurich +# Author: Run Wang, ETH Zurich # # ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 @@ -28,4 +28,4 @@ referenceTemplate = NodeTemplate(""" // GELU (Name: ${nodeName}, Op: ${nodeOp}) SINGLE_CORE GELU_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}); -""") +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 8fc4d9d97b..a8ba45ed46 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -37,23 +37,23 @@ from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import ConcatTemplate, DequantTemplate, FloatGELUTemplate, FloatGemmTemplate, \ - FloatLayernormTemplate, FloatMatMulTemplate, FloatMulTemplate, FloatReduceSumTemplate, FloatReluTemplate, \ - FloatSoftmaxTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, iHardswishTemplate -from Deeploy.Targets.Generic.TypeCheckers import ConcatChecker, ConvChecker, DequantChecker, GatherChecker, \ - GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, QuantChecker, \ - ReduceMeanChecker, ReluChecker, RQAddChecker, RQHardswishChecker, SGDChecker, SliceChecker, SoftmaxChecker, \ - SoftmaxCrossEntropyLossChecker, TransposeChecker +from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatGemmTemplate, \ + FloatMulTemplate, FloatReduceSumTemplate, FloatSoftmaxTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, \ + iHardswishTemplate +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ + GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ + QuantChecker, ReduceMeanChecker, ReluChecker, RQAddChecker, RQHardswishChecker, SGDChecker, SliceChecker, \ + SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatConvTemplate, FloatMaxPoolTemplate, \ - FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, \ - RequantShiftTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SliceTemplate, \ - SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ - iRMSNormTemplate, iSoftmaxTemplate +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, \ + FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatReluTemplate, FloatSoftmaxTemplate, \ + GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, \ + 
RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SliceTemplate, SoftmaxCrossEntropyLossTemplate, \ + TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement @@ -172,6 +172,16 @@ for _type3 in [int8_t, uint8_t] ] +PULPAddBindings = [ + NodeBinding(AddChecker([PointerClass(type1), PointerClass(type2)], [PointerClass(int32_t)]), + AddTemplate.referenceTemplate, ForkTransformer) + for type1 in IntegerDataTypes + for type2 in IntegerDataTypes +] + [ + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate.referenceTemplate, ForkTransformer) +] + PULPRQSConv2DBindings = [ NodeBinding( PULPConvChecker([ @@ -215,7 +225,7 @@ PULPFloatConv2DBindings = [ NodeBinding( ConvChecker([PointerClass(float32_t), PointerClass(float32_t), - PointerClass(float32_t)], [PointerClass(float32_t)]), FloatConvTemplate.reference2DTemplate, + PointerClass(float32_t)], [PointerClass(float32_t)]), FloatConvTemplate.reference2DIm2ColTemplate, ForkTransformer) ] @@ -264,7 +274,7 @@ GEMMTemplate.PULPMM_8_Template, ClusterTransformer) ] + [ NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), - FloatMatMulTemplate.referenceTemplate, ClusterTransformer) + FloatMatMulTemplate.referenceTemplate, ForkTransformer) ] PULPReduceMeanBindings = [ diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py new file mode 100644 index 0000000000..850de69e55 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py @@ -0,0 +1,50 @@ +# ---------------------------------------------------------------------- +# +# File: FloatAddTemplate.py +# +# Last edited: 13.11.2024 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Authors: +# - Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
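+#
+# The template below splits the flat tensor evenly across the cluster
+# cores: each core takes one contiguous chunk of ceil(size / NUM_CORES)
+# elements, computed with a shift by log2(NUM_CORES), which assumes
+# NUM_CORES is a power of two. A Python sketch of the per-core bounds
+# (hypothetical helper, for illustration only):
+#
+#     def chunk_bounds(size: int, num_cores: int, core_id: int):
+#         log2_cores = num_cores.bit_length() - 1
+#         chunk = (size >> log2_cores) + ((size & (num_cores - 1)) != 0)
+#         start = min(chunk * core_id, size)
+#         return start, min(start + chunk, size)
+#
+#     # size=100, num_cores=8: core 0 -> (0, 13), core 7 -> (91, 100)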
+ +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// Add Parallel with 1x6 unrolling (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); +int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); + +uint32_t i = ${nodeName}_chunk_start; +for (; i+5 < ${nodeName}_chunk_stop; i+=6) { + ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i]; + ${data_out}[i+1] = ${data_in_1}[i+1] + ${data_in_2}[i+1]; + ${data_out}[i+2] = ${data_in_1}[i+2] + ${data_in_2}[i+2]; + ${data_out}[i+3] = ${data_in_1}[i+3] + ${data_in_2}[i+3]; + ${data_out}[i+4] = ${data_in_1}[i+4] + ${data_in_2}[i+4]; + ${data_out}[i+5] = ${data_in_1}[i+5] + ${data_in_2}[i+5]; +} + +for (; i < ${nodeName}_chunk_stop; i++) { + ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i]; +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index 9e8ec57643..0a368fd413 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -23,26 +23,115 @@ # See the License for the specific language governing permissions and # limitations under the License. -from Deeploy.DeeployTypes import NodeTemplate +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class PULP2DFloatConvIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * + operatorRepresentation['dim_kernel_y']) + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = PULP2DFloatConvIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + reference2DTemplate = NodeTemplate(""" -// 2D FP Conv HWC (Name: ${nodeName}, Op: ${nodeOp}) -BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; - - for (uint32_t n=0; n<${batch}; ++n) { - Conv2d_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( - ref_${data_out}_${data_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, - ${weight}, ${ch_im_out}, - ${dim_kernel_x}, ${dim_kernel_y}, - ${stride_x}, ${stride_y}, - ref_${data_out}_${data_out}, - ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} - ); - ref_${data_out}_${data_in} += 
${ch_im_in}*${dim_im_in_x}*${dim_im_in_y}; - ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; - } -END_SINGLE_CORE +// 2D FP Conv HWC Parallel (Name: ${nodeName}, Op: ${nodeOp}) + +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_ch_out_chunk = (${ch_im_out} >> ${nodeName}_log2Core) + ((${ch_im_out} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_ch_out_start = MIN(${nodeName}_ch_out_chunk*${nodeName}_core_id, ${ch_im_out}); +int16_t ${nodeName}_ch_out_stop = MIN(${nodeName}_ch_out_start + ${nodeName}_ch_out_chunk, ${ch_im_out}); +int16_t ${nodeName}_ch_out_count = ${nodeName}_ch_out_stop - ${nodeName}_ch_out_start; + +${weight_type.typeName} ${nodeName}_weight_ptr = ${weight} + ${nodeName}_ch_out_start * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}; + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + +for (uint32_t n=0; n<${batch}; ++n) { + + Conv2d_ChannelRange_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in}, + ${nodeName}_weight_ptr, ${nodeName}_ch_out_count, + ${dim_kernel_y}, ${dim_kernel_x}, + ${stride_y}, ${stride_x}, + ref_${data_out}_${data_out}, ${ch_im_out}, ${nodeName}_ch_out_start, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} + ); + + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} + +""") + +reference2DIm2ColTemplate = PULP2DFloatConvIm2ColTemplate(""" +// 2D FP Conv HWC Parallel with Im2Col (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_ch_out_chunk = (${ch_im_out} >> ${nodeName}_log2Core) + ((${ch_im_out} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_ch_out_start = MIN(${nodeName}_ch_out_chunk*${nodeName}_core_id, ${ch_im_out}); +int16_t ${nodeName}_ch_out_stop = MIN(${nodeName}_ch_out_start + ${nodeName}_ch_out_chunk, ${ch_im_out}); +int16_t ${nodeName}_ch_out_count = ${nodeName}_ch_out_stop - ${nodeName}_ch_out_start; + +${weight_type.typeName} ${nodeName}_weight_ptr = ${weight} + ${nodeName}_ch_out_start * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}; + + +uint32_t ${nodeName}_im2col_size_per_core = ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}; +${data_out_type.typeName} ${nodeName}_im2col_buffer = ((${data_out_type.typeName})${ctxtBuffer}) + ${nodeName}_core_id * ${nodeName}_im2col_size_per_core; + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + + Conv2d_Im2Col_ChannelRange_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ${dim_im_in_y}, + ${dim_im_in_x}, + ${ch_im_in}, + ${nodeName}_weight_ptr, + ${nodeName}_ch_out_count, + ${dim_kernel_y}, + ${dim_kernel_x}, + ${stride_y}, + ${stride_x}, + ref_${data_out}_${data_out}, + ${ch_im_out}, + ${nodeName}_ch_out_start, + ${padding_y_top}, + ${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, + ${nodeName}_im2col_buffer + ); + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} 
* ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} """) diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py new file mode 100644 index 0000000000..40890b3426 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py @@ -0,0 +1,37 @@ +# ---------------------------------------------------------------------- +# +# File: FloatGELUTemplate.py +# +# Last edited: 04.05.2025 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// GELU Parallel (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); +int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); + +GELU_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_sigmoid_chunk(${data_in}, ${data_out}, ${nodeName}_chunk_start, ${nodeName}_chunk_stop); +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py new file mode 100644 index 0000000000..ccb4c03751 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py @@ -0,0 +1,62 @@ +# ---------------------------------------------------------------------- +# +# File: FloatLayernormTemplate.py +# +# Last edited: 23.01.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
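+#
+# Parallelization note: the tensor is treated as (size / lastDimLength)
+# rows of lastDimLength elements, and each core normalizes a contiguous
+# block of whole rows, so the per-row mean and variance never cross a
+# core boundary. For example, with size = 4096, lastDimLength = 64 and
+# 8 cores, each core handles 8 rows, i.e. 512 elements (assuming
+# NUM_CORES is a power of two, as the log2-based chunking below requires).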
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// FloatLayernorm Parallel (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_log2Core = log2(NUM_CORES);
+
+int32_t ${nodeName}_seq_length = ${size} / ${lastDimLength};
+int32_t ${nodeName}_chunk = (${nodeName}_seq_length >> ${nodeName}_log2Core) +
+                            ((${nodeName}_seq_length & (NUM_CORES-1)) != 0);
+int32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${nodeName}_seq_length);
+int32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${nodeName}_seq_length);
+
+
+int32_t ${nodeName}_elem_start = ${nodeName}_start * ${lastDimLength};
+int32_t ${nodeName}_elem_end = ${nodeName}_end * ${lastDimLength};
+int32_t ${nodeName}_elem_count = ${nodeName}_elem_end - ${nodeName}_elem_start;
+
+
+const float* ${nodeName}_data_in_ptr = ${data_in} + ${nodeName}_elem_start;
+float* ${nodeName}_data_out_ptr = ${data_out} + ${nodeName}_elem_start;
+
+
+if (${nodeName}_elem_count > 0) {
+    Layernorm_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
+        ${nodeName}_data_in_ptr,
+        ${nodeName}_data_out_ptr,
+        ${weight},
+        ${bias},
+        ${epsilon},
+        ${nodeName}_elem_count,
+        ${lastDimLength}
+    );
+}
+
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py
new file mode 100644
index 0000000000..7d558b7100
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py
@@ -0,0 +1,53 @@
+# ----------------------------------------------------------------------
+#
+# File: FloatMatMulTemplate.py
+#
+# Last edited: 28.03.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// Matmul with row parallelism (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_log2Core = log2(NUM_CORES);
+int32_t ${nodeName}_M_chunk = (${M} >> ${nodeName}_log2Core) + ((${M} & (NUM_CORES-1))!=0);
+int32_t ${nodeName}_M_start = MIN(${nodeName}_core_id * ${nodeName}_M_chunk, ${M});
+int32_t ${nodeName}_M_end = MIN(${nodeName}_M_start + ${nodeName}_M_chunk, ${M});
+int32_t ${nodeName}_M_size = ${nodeName}_M_end - ${nodeName}_M_start;
+
+for(uint32_t b=0; b<${batch}; b++) {
+    ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
+    ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
+    ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};
+
+    if (${nodeName}_M_size > 0) {
+        MatMul_fp32_fp32_fp32_unroll1x7(
+            batch_A + ${nodeName}_M_start * ${N},
+            batch_B,
+            batch_out + ${nodeName}_M_start * ${O},
+            ${nodeName}_M_size,
+            ${N},
+            ${O}
+        );
+    }
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py
index 5c58ed6723..fd1e83b9b1 100644
--- a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py
@@ -26,23 +26,29 @@
 from Deeploy.DeeployTypes import NodeTemplate
 
 referenceTemplate = NodeTemplate("""
+// 2D Float MaxPool Channel Parallel (Name: ${nodeName}, Op: ${nodeOp})
 
-// 2D Float MaxPool (Name: ${nodeName}, Op: ${nodeOp})
-BEGIN_SINGLE_CORE
-    ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
-    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_log2Core = log2(NUM_CORES);
+int16_t ${nodeName}_ch_chunk = (${ch_im_in} >> ${nodeName}_log2Core) + ((${ch_im_in} & (NUM_CORES-1))!=0);
+int16_t ${nodeName}_ch_start = MIN(${nodeName}_ch_chunk*${nodeName}_core_id, ${ch_im_in});
+int16_t ${nodeName}_ch_stop = MIN(${nodeName}_ch_start + ${nodeName}_ch_chunk, ${ch_im_in});
+int16_t ${nodeName}_ch_count = ${nodeName}_ch_stop - ${nodeName}_ch_start;
 
-    for (uint32_t n=0; n<${batch}; ++n) {
-        MaxPool2d_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC(
-            ref_${data_out}_${data_in},
-            ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in},
-            ${dim_kernel_x}, ${dim_kernel_y},
-            ${stride_x}, ${stride_y},
-            ref_${data_out}_${data_out},
-            ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}
-        );
-        ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y};
-        ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y};
-    }
-END_SINGLE_CORE
-""")
+${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
+${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+
+for (uint32_t n=0; n<${batch}; ++n) {
+    MaxPool2d_ChannelRange_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC(
+        ref_${data_out}_${data_in},
+        ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in},
+        ${dim_kernel_x}, ${dim_kernel_y},
+        ${stride_x}, ${stride_y},
+        ref_${data_out}_${data_out},
+        ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right},
+        ${nodeName}_ch_start, ${nodeName}_ch_count
+    );
+    ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y};
+    
ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py new file mode 100644 index 0000000000..a6e93ae6ae --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py @@ -0,0 +1,44 @@ +# ---------------------------------------------------------------------- +# +# File: FloatReluTemplate.py +# +# Last edited: 04.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// Parallel ReLU (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int32_t ${nodeName}_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); +int32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${size}); +int32_t ${nodeName}_local_size = ${nodeName}_end - ${nodeName}_start; + +if (${nodeName}_local_size > 0) { + Relu_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ${data_in} + ${nodeName}_start, + ${data_out} + ${nodeName}_start, + ${nodeName}_local_size + ); +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py index be2fbc796c..01edb04676 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py @@ -26,8 +26,25 @@ from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" -// Softmax (Name: ${nodeName}, Op: ${nodeOp}) -SINGLE_CORE Softmax_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${lastDimLength}); +// Softmax Parallel (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int32_t ${nodeName}_num_vectors = ${size} / ${lastDimLength}; +int32_t ${nodeName}_chunk = (${nodeName}_num_vectors >> ${nodeName}_log2Core) + ((${nodeName}_num_vectors & (NUM_CORES-1))!=0); +int32_t ${nodeName}_vector_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${nodeName}_num_vectors); +int32_t ${nodeName}_vector_end = MIN(${nodeName}_vector_start + ${nodeName}_chunk, ${nodeName}_num_vectors); +int32_t ${nodeName}_local_size = (${nodeName}_vector_end - ${nodeName}_vector_start) * ${lastDimLength}; + +if (${nodeName}_local_size > 0) { + int32_t ${nodeName}_data_offset = ${nodeName}_vector_start * ${lastDimLength}; + + 
Softmax_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ${data_in} + ${nodeName}_data_offset, + ${data_out} + ${nodeName}_data_offset, + ${nodeName}_local_size, + ${lastDimLength} + ); +} """) referenceGradientTemplate = NodeTemplate(""" diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py index 457bd3fda7..ea94e2db7a 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -355,25 +355,17 @@ def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkCo strides = parseDict["strides"] padding = parseDict["pads"] - # VIC: Force at least one row of A and one col of B in the GEMM (since it's a im2col Conv) to avoid partial results + # RW: Conv only tiled on outchannel + tilerModel.addConstraint(inputHeightVar == parseDict['dim_im_in_x']) + tilerModel.addConstraint(inputWidthVar == parseDict['dim_im_in_y']) tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) - if (parseDict["ch_im_out"] >= 8): - tilerModel.addMinTileSizeConstraint(parseDict, 'ch_im_out', outputChannelVar, 8) - - tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) - tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y']) - tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) - - # VIC: Constraint the minimum tile size such that we can apply at least one kernel on it - tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) - tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y']) - tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x']) tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y']) + tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) - tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) - tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + if (parseDict["ch_im_out"] >= 8): + tilerModel.addMinTileSizeConstraint(parseDict, 'ch_im_out', outputChannelVar, 8) return tilerModel diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py index 7f8a456265..b72cc9115e 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py @@ -235,6 +235,7 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw dimOffsetA = len(bufferA.shape) - 2 dimOffsetB = len(bufferB.shape) - 2 + dimOffsetC = len(bufferC.shape) - 2 dimOffsetOut = len(outputBuffer.shape) - 2 AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA']) @@ -253,8 +254,8 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw # Add GEMM Geometrical constraints tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) - addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = 0) - addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = 1) + addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC) + addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1) tilerModel.addConstraint(outputFirstDimVar == addDimVar_1) tilerModel.addConstraint(outputSecondDimVar == addDimVar_2) diff --git 
a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py
index 21d638d0b8..78fb77cf77 100644
--- a/Deeploy/Targets/PULPOpen/Tiler.py
+++ b/Deeploy/Targets/PULPOpen/Tiler.py
@@ -28,7 +28,7 @@
 from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import MemoryPassthroughGeneration
 from Deeploy.DeeployTypes import CodeTransformation
-from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicReshapeBindings
+from Deeploy.Targets.Generic.Bindings import BasicReshapeBindings
 from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint
@@ -40,13 +40,14 @@
 from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint
-from Deeploy.Targets.PULPOpen.Bindings import ForkTransformer, PULPConcatBindings, PULPFloatConv2DBindings, \
+from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \
     PULPFloatGELUBinding, PULPFloatGEMMBindings, PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \
     PULPiRQSGELUBindings, PULPLayernormBinding, PULPMatMulBindings, PULPMaxPool2DBindings, PULPMulBindings, \
     PULPReduceSumBindings, PULPReluBinding, PULPRQAddBindings, PULPRQSBindings, PULPRQSConv2DBindings, \
     PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, \
     PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, \
-    PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings
+    PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, PULPTransposeBindings, \
+    PULPUniformRQSBindings
 from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint
@@ -114,11 +115,7 @@
 PULPTransposeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPTransposeBindings,
                                                            tileConstraint = TransposeTileConstraint())
 
-_PULPAddBindings = copy.deepcopy(BasicAddBindings)
-for binding in _PULPAddBindings:
-    binding.codeTransformer = ForkTransformer
-
-PULPAddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = _PULPAddBindings,
+PULPAddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPAddBindings,
                                                      tileConstraint = AddTileConstraint())
 
 PULPSoftmaxTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSoftmaxBindings,
diff --git a/Deeploy/Targets/Redmule/Bindings.py b/Deeploy/Targets/Redmule/Bindings.py
new file mode 100644
index 0000000000..df811b8b5f
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Bindings.py
@@ -0,0 +1,52 @@
+# ----------------------------------------------------------------------
+#
+# File: Bindings.py
+#
+# Last edited: 10.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+# +# Author: +# Luka Macan, University of Bologna +# Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t +from Deeploy.DeeployTypes import NodeBinding +from Deeploy.Targets.Generic.TypeCheckers import MatMulChecker, ConvChecker, GEMMChecker +from Deeploy.Targets.Redmule.Templates import MatmulTemplate, ConvTemplate, GEMMTemplate +from Deeploy.Targets.PULPOpen.Bindings import ForkTransformer + +RedmuleMatmulBindings = [ + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + MatmulTemplate.referenceTemplate, ForkTransformer) +] + +RedmuleConv2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), ConvTemplate.reference2DIm2ColTemplate, + ForkTransformer) +] + +RedmuleGEMMBindings = [ + NodeBinding( + GEMMChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), GEMMTemplate.referenceTemplate, + ForkTransformer) +] \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/Deployer.py b/Deeploy/Targets/Redmule/Deployer.py new file mode 100644 index 0000000000..23b322bbfc --- /dev/null +++ b/Deeploy/Targets/Redmule/Deployer.py @@ -0,0 +1,52 @@ +# ---------------------------------------------------------------------- +# +# File: Deployer.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
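+#
+# RedmuleDeployer reuses the full PULPDeployer flow and only appends two
+# RedMule-specific lowering passes, so they run after the standard PULP
+# lowering. A minimal usage sketch (argument values are placeholders,
+# not a tested configuration):
+#
+#     deployer = RedmuleDeployer(graph, platform, inputTypes, optimizer)
+#     # deployer.loweringOptimizer.passes now ends with the two RedMule passes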
+
+from typing import Callable, Dict, Type
+
+import onnx_graphsurgeon as gs
+
+from Deeploy.AbstractDataTypes import Pointer
+from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
+from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer
+from Deeploy.Targets.Redmule.TopologyOptimizationPasses.Passes import RedMuleAdjustWeightMemoryLayoutPass, \
+    RedMuleGEMMTransposePass
+
+
+class RedmuleDeployer(PULPDeployer):
+
+    def __init__(self,
+                 graph: gs.Graph,
+                 deploymentPlatform: DeploymentPlatform,
+                 inputTypes: Dict[str, Type[Pointer]],
+                 loweringOptimizer: TopologyOptimizer,
+                 scheduler: Callable = lambda graph: list(graph.nodes),
+                 name: str = 'DeeployNetwork',
+                 default_channels_first = False,
+                 deeployStateDir: str = "DeeployStateDir",
+                 inputOffsets = {}):
+        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
+                         default_channels_first, deeployStateDir, inputOffsets)
+
+        self.loweringOptimizer.passes += [
+            RedMuleAdjustWeightMemoryLayoutPass("Redmule"),
+            RedMuleGEMMTransposePass("Redmule")
+        ]
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Engine.py b/Deeploy/Targets/Redmule/Engine.py
new file mode 100644
index 0000000000..1022362c57
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Engine.py
@@ -0,0 +1,65 @@
+# ----------------------------------------------------------------------
+#
+# File: Engine.py
+#
+# Last edited: 26.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Moritz Scherer, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
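+#
+# The engine only claims the operators listed in RedmuleMapping below
+# ('MatMul', 'Conv', 'Gemm'); in RedmulePlatform it is registered next to
+# a PULPClusterEngine, so every other operator keeps using the regular
+# PULP cluster kernels.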
+
+from typing import List
+
+from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper
+from Deeploy.Targets.Generic.Layers import ConvLayer, GEMMLayer, MatMulLayer
+from Deeploy.Targets.Generic.Parsers import MatMulParser
+from Deeploy.Targets.PULPOpen.Parsers import PULPFPConv2DParser
+from Deeploy.Targets.Redmule.Parsers import GEMMRedmuleParser
+from Deeploy.Targets.Redmule.Tiler import RedmuleConvTilingReadyBindings, RedmuleGEMMTilingReadyBindings, \
+    RedmuleMatMulTilingReadyBindings
+
+MatMulRedmuleMapper = NodeMapper(MatMulParser(), RedmuleMatMulTilingReadyBindings)
+Conv2DRedmuleMapper = NodeMapper(PULPFPConv2DParser(), RedmuleConvTilingReadyBindings)
+GEMMRedmuleMapper = NodeMapper(GEMMRedmuleParser(), RedmuleGEMMTilingReadyBindings)
+
+RedmuleMapping = {
+    'MatMul': MatMulLayer([MatMulRedmuleMapper]),
+    'Conv': ConvLayer([Conv2DRedmuleMapper]),
+    'Gemm': GEMMLayer([GEMMRedmuleMapper]),
+}
+
+_includeList = []
+
+_redmuleInitCode = r"""
+// Redmule engine initialization
+"""
+
+
+class RedmuleEngine(DeploymentEngine):
+
+    def __init__(self,
+                 name: str,
+                 Mapping = RedmuleMapping,
+                 initCode: str = _redmuleInitCode,
+                 includeList: List[str] = _includeList) -> None:
+        super().__init__(name, Mapping, initCode, includeList)
+
+
diff --git a/Deeploy/Targets/Redmule/Parsers.py b/Deeploy/Targets/Redmule/Parsers.py
new file mode 100644
index 0000000000..383fe59f31
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Parsers.py
@@ -0,0 +1,99 @@
+# ----------------------------------------------------------------------
+#
+# File: Parsers.py
+#
+# Last edited: 15.12.2021
+#
+# Copyright (C) 2021, ETH Zurich and University of Bologna.
+#
+# Authors:
+# - Moritz Scherer, ETH Zurich
+# - Victor Jung, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
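+#
+# GEMMRedmuleParser accepts Gemm-style nodes with alpha == 1 and fills in
+# the ONNX Gemm defaults for any missing attribute (transA = 0,
+# transB = 0, alpha = 1, beta = 1). For example, a node with
+# attrs == {'transB': 1} parses as transA = 0, transB = 1, alpha = 1,
+# beta = 1.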
+
+from typing import Tuple
+
+import numpy as np
+import onnx_graphsurgeon as gs
+
+from Deeploy.DeeployTypes import NetworkContext
+from Deeploy.Targets.Generic.Parsers import MatMulParser
+
+
+class GEMMRedmuleParser(MatMulParser):
+
+    def __init__(self, noBiasHoisting = True):
+        self.noBiasHoisting = noBiasHoisting
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+
+        ret = all([
+            len(node.inputs) >= 2,
+            len(node.outputs) == 1,
+            # Use .get() so nodes without an explicit alpha (ONNX default 1.0) are accepted
+            node.attrs.get('alpha', 1) == 1
+        ])
+
+        if ret:
+            if 'transA' in node.attrs:
+                self.operatorRepresentation['transA'] = node.attrs['transA']
+            else:
+                self.operatorRepresentation['transA'] = 0
+
+            if 'transB' in node.attrs:
+                self.operatorRepresentation['transB'] = node.attrs['transB']
+            else:
+                self.operatorRepresentation['transB'] = 0
+            if 'alpha' in node.attrs:
+                self.operatorRepresentation['alpha'] = node.attrs['alpha']
+            else:
+                self.operatorRepresentation['alpha'] = 1
+            if 'beta' in node.attrs:
+                self.operatorRepresentation['beta'] = node.attrs['beta']
+            else:
+                self.operatorRepresentation['beta'] = 1
+
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)
+
+        if ret:
+            inputs = ['A', 'B']
+            outputs = ['data_out']
+
+            for idx, inputNode in enumerate(node.inputs):
+                if idx < len(inputs):
+                    self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name
+            for idx, outputNode in enumerate(node.outputs):
+                self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name
+
+            if len(node.inputs) == 3:
+                self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name
+            elif not self.noBiasHoisting:
+                values = np.zeros((1))
+                zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values)
+                newCtxt.hoistConstant(zeroTensor)
+                self.operatorRepresentation['C'] = f'{node.name}_C_Tensor'
+
+            self.operatorRepresentation['size'] = np.prod(newCtxt.lookup(node.inputs[0].name).shape)
+
+        return newCtxt, ret
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Platform.py b/Deeploy/Targets/Redmule/Platform.py
new file mode 100644
index 0000000000..c0587a4ead
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Platform.py
@@ -0,0 +1,45 @@
+# ----------------------------------------------------------------------
+#
+# File: Platform.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
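+#
+# RedmulePlatform differs from PULPPlatform only in its engine list: a
+# RedmuleEngine is registered alongside the usual PULPClusterEngine, so
+# RedMule-mapped operators (MatMul, Conv, Gemm) can go to the accelerator
+# while everything else stays on the cluster. Instantiation sketch:
+#
+#     platform = RedmulePlatform()   # engines: Redmule + PULPCluster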
+ +from Deeploy.DeeployTypes import TopologyOptimizer +from Deeploy.Targets.Redmule.Engine import RedmuleEngine +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, \ + PULPOptimizer, PULPPlatform, PULPStructBuffer, PULPTransientBuffer, PULPVariableBuffer, PULPConstantBuffer + +RedmuleOptimizer = TopologyOptimizer([ + *PULPOptimizer.passes +]) + +class RedmulePlatform(PULPPlatform): + + def __init__(self, + engines = [RedmuleEngine("Redmule"), PULPClusterEngine("PULPCluster")], + variableBuffer = PULPVariableBuffer, + constantBuffer = PULPConstantBuffer, + structBuffer = PULPStructBuffer, + transientBuffer = PULPTransientBuffer) -> None: + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + diff --git a/Deeploy/Targets/Redmule/Templates/ConvTemplate.py b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py new file mode 100644 index 0000000000..5ad5f51e5f --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py @@ -0,0 +1,88 @@ +# ---------------------------------------------------------------------- +# +# File: ConvTemplate.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
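+#
+# The transient im2col buffer hoisted below is sized as
+#     4 * 8 * (ch_im_in * dim_kernel_x * dim_kernel_y)
+# bytes, i.e. one float32 (4 bytes) im2col column replicated 8 times; the
+# factor 8 mirrors the per-core buffers of the PULP im2col template. For
+# example, ch_im_in = 16 with a 3x3 kernel reserves 4 * 8 * 144 = 4608
+# bytes.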
+
+from typing import Dict, List, Tuple, Union
+
+from ortools.constraint_solver.pywrapcp import IntVar
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class RedmuleFloatConvIm2ColTemplate(NodeTemplate):
+
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+
+    @staticmethod
+    def computeTransientBuffersSize(
+            ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
+        im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] *
+                              operatorRepresentation['dim_kernel_y'])
+        im2col_name = operatorRepresentation['nodeName'] + "_buffer"
+        return [(im2col_name, im2col_dim)]
+
+    def hoistTransientBuffers(self, ctxt: NetworkContext,
+                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        im2col_name, im2col_dim = RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
+        ctxt.hoistTransientBuffer(im2col_name, im2col_dim)
+
+        operatorRepresentation['ctxtBuffer'] = im2col_name
+        operatorRepresentation['ctxtBufferSize'] = im2col_dim
+        return ctxt, operatorRepresentation, [im2col_name]
+
+
+reference2DIm2ColTemplate = RedmuleFloatConvIm2ColTemplate("""
+// 2D FP Conv HWC Parallel with Im2Col (Name: ${nodeName}, Op: ${nodeOp})
+${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
+${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+
+for (uint32_t n=0; n<${batch}; ++n) {
+
+    Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC_8_Redmule(
+        ref_${data_out}_${data_in},
+        ${dim_im_in_y},
+        ${dim_im_in_x},
+        ${ch_im_in},
+        ${weight},
+        ${dim_kernel_y},
+        ${dim_kernel_x},
+        ${stride_y},
+        ${stride_x},
+        ref_${data_out}_${data_out},
+        ${ch_im_out},
+        ${padding_y_top},
+        ${padding_y_bottom},
+        ${padding_x_left},
+        ${padding_x_right},
+        ${ctxtBuffer}
+    );
+
+    ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y};
+    ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y};
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py
new file mode 100644
index 0000000000..1ac45c3e6d
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py
@@ -0,0 +1,62 @@
+# ----------------------------------------------------------------------
+#
+# File: GEMMTemplate.py
+#
+# Last edited: 27.01.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// GEMM using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+
+if (${nodeName}_core_id == 0) {
+    for(uint32_t b=0; b<${batch}; b++) {
+        ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
+        ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
+        ${C_type.typeName} batch_C = ${C} + b * ${M} * ${O};
+        ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};
+
+        % if beta == 0:
+        MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
+            (const float32_t *) batch_A,
+            (const float32_t *) batch_B,
+            (float32_t *) batch_out,
+            ${M},
+            ${N},
+            ${O}
+        );
+        % else:
+        Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
+            (const float32_t *) batch_A,
+            (const float32_t *) batch_B,
+            (const float32_t *) batch_C,
+            (float32_t *) batch_out,
+            ${M},
+            ${N},
+            ${O}
+        );
+        % endif
+    }
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py
new file mode 100644
index 0000000000..cb077ca897
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py
@@ -0,0 +1,49 @@
+# ----------------------------------------------------------------------
+#
+# File: MatmulTemplate.py
+#
+# Last edited: 27.01.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// Matmul using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_num_cores = NUM_CORES;
+
+if (${nodeName}_core_id == 0) {
+    for(uint32_t b=0; b<${batch}; b++) {
+        ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
+        ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
+        ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};
+
+        MatMul_fp32_fp32_fp32_Redmule(
+            (const float32_t *) batch_A,
+            (const float32_t *) batch_B,
+            (float32_t *) batch_out,
+            ${M},
+            ${N},
+            ${O}
+        );
+    }
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Templates/__init__.py b/Deeploy/Targets/Redmule/Templates/__init__.py
new file mode 100644
index 0000000000..a73187ca8f
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------
+#
+# File: __init__.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py new file mode 100644 index 0000000000..61ef736773 --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py @@ -0,0 +1,283 @@ + + +# ---------------------------------------------------------------------- +# +# File: ConvTileConstraint.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
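# A numpy reference for the per-batch computation the GEMM and MatMul templates
# above expand to (a minimal sketch: alpha is fixed to 1 by the parser, and the
# RedMule Gemm kernel takes no beta argument, so a present C is added as-is):
import numpy as np

def redmule_gemm_reference(A: np.ndarray, B: np.ndarray, C = None, beta = 1):
    out = A @ B                      # MatMul_..._Redmule path when beta == 0
    if beta != 0 and C is not None:
        out = out + C                # Gemm_..._Redmule path otherwise
    return out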
+ +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel, PerformanceHint +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + + +class RedmuleConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + strides = parseDict["strides"] + padding = parseDict["pads"] + dilation = parseDict["dilations"] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel (now at index 3) + + inputBuffer = ctxt.lookup(inputBufferName) + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (weightHeightVar - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (weightWidthVar - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = 
tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + padding = parseDict["pads"] + + tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + # RW: Conv only tiled on outchannel + tilerModel.addConstraint(inputHeightVar == parseDict['dim_im_in_x']) + tilerModel.addConstraint(inputWidthVar == parseDict['dim_im_in_y']) + tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + + tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x']) + tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y']) + tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) + + outChannel = parseDict["ch_im_out"] + if outChannel >= 12: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "ch_im_out", + weightOutChannelVar, + 12, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max(), strategy = PerformanceHint(1)) + + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + symbolicParseDict = parseDict.copy() + symbolicParseDict['dim_im_in_x'] = tilerModel.getTensorDimVar(inputBuffer.name, 1) + # Using updated dimension indexes for kernel dimensions + symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 0) + symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 1) + + return symbolicParseDict + + @staticmethod + def computeMargins(kernelShape: Tuple[int, ...]) -> Tuple[int, ...]: + if kernelShape[1] % 2 == 0: + leftMargin = 0 + rightMargin = 0 + else: + leftMargin = ((kernelShape[1]) // 2) + rightMargin = ((kernelShape[1]) // 2) + + if kernelShape[0] % 2 == 0: + topMargin = 0 + bottomMargin = 0 + else: + topMargin = ((kernelShape[0]) // 2) + bottomMargin = ((kernelShape[0]) // 2) + + return leftMargin, rightMargin, topMargin, bottomMargin + + @staticmethod + def computeInputCube(kernelShape: Tuple[int, ...], pads: Tuple[int, ...], strides: Tuple[int, ...], + weightChannels: int, outputCube: HyperRectangle, + outputDims: Tuple[int, ...]) -> Tuple[HyperRectangle, Tuple[int, ...]]: + + (BatchOffset, HOffset, WOffset, COffset) = outputCube.offset + (BatchSize, HSize, WSize, CSize) = outputCube.dims + + leftMargin, rightMargin, topMargin, bottomMargin = RedmuleConv2DTileConstraint.computeMargins(kernelShape) + + padding_top = (HOffset == 0) * pads[0] + padding_bottom = (HOffset + HSize == outputDims[1]) * pads[2] + + padding_left = (WOffset == 0) * pads[1] + padding_right = (WOffset + WSize == outputDims[2]) * pads[3] + + inputHOffset = HOffset * strides[0] - topMargin * (HOffset != 0) + inputWOffset = WOffset * strides[1] - leftMargin * (WOffset != 0) + + inputHSize = HSize * strides[0] + (topMargin + bottomMargin) - (padding_top + 
padding_bottom) + inputWSize = WSize * strides[1] + (leftMargin + rightMargin) - (padding_left + padding_right) + + InCube = HyperRectangle((BatchOffset, inputHOffset, inputWOffset, 0), + (BatchSize, inputHSize, inputWSize, weightChannels)) + + return InCube, (padding_left, padding_right, padding_top, padding_bottom) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'weight', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varWeight = operatorRepresentation['weight'] + varOut = operatorRepresentation['data_out'] + + inputInCubes = [] + inputWeightCubes = [] + replacements: Dict[str, List[int]] = { + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "ch_im_out": [], + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [] + } + + replacementTypes = { + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "ch_im_out": PointerClass(uint16_t), + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t) + } + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightH = ctxt.lookup(varWeight).shape[0] # Now index 0 + weightW = ctxt.lookup(varWeight).shape[1] # Now index 1 + weightC = ctxt.lookup(varWeight).shape[2] # Now index 2 (Cin) + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for cube in outputCubes: + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = RedmuleConv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, + cube, + ctxt.lookup(varOut).shape) + + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['dim_im_in_x'].append(InCube.dims[1]) + replacements['dim_im_in_y'].append(InCube.dims[2]) + replacements['dim_im_out_x'].append(HSize) + replacements['dim_im_out_y'].append(WSize) + replacements['ch_im_out'].append(CSize) + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inputInCubes.append(InCube) + + # Updated WeightCube for (H, W, Cin, Cout) format + # COffset is now applied to dimension 3 (Cout) + WeightCube = HyperRectangle((0, 0, 0, COffset), (weightH, weightW, weightC, CSize)) + + inputWeightCubes.append(WeightCube) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, 
tilingSchedule \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py new file mode 100644 index 0000000000..a91a0b929c --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py @@ -0,0 +1,198 @@ + +# ---------------------------------------------------------------------- +# +# File: GEMMTileConstraint.py +# +# Last edited: 02.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleGEMMTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + bufferC = ctxt.lookup(name = parseDict['C']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + # Add I/O dimensions to the model as variables + for bufferName in [bufferA.name, bufferB.name, bufferC.name, outputBuffer.name]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + dimOffsetA = len(bufferA.shape) - 2 + dimOffsetB = len(bufferB.shape) - 2 + dimOffsetC = len(bufferC.shape) - 2 + dimOffsetOut = len(outputBuffer.shape) - 2 + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = dimOffsetA + 1 - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = dimOffsetB + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = dimOffsetB + 1 - parseDict['transB']) + outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = dimOffsetOut) + outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = dimOffsetOut + 1) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) + tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) + + # Add GEMM Geometrical constraints + tilerModel.addConstraint(ASecondDimVar == 
BFirstDimVar) + + addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC) + addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1) + tilerModel.addConstraint(outputFirstDimVar == addDimVar_1) + tilerModel.addConstraint(outputSecondDimVar == addDimVar_2) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + from Deeploy.TilingExtension.TilerModel import PerformanceHint + + bufferA = ctxt.lookup(name=parseDict['A']) + bufferB = ctxt.lookup(name=parseDict['B']) + + tensorsShapeLen = min(len(bufferA.shape), len(bufferB.shape)) + + dimOffsetA = len(bufferA.shape) - 2 + dimOffsetB = len(bufferB.shape) - 2 + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName=bufferA.name, dimIdx=dimOffsetA + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName=bufferA.name, dimIdx=dimOffsetA + 1 - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName=bufferB.name, dimIdx=dimOffsetB + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName=bufferB.name, dimIdx=dimOffsetB + 1 - parseDict['transB']) + + # VIC: We don't want to deal with intermediate results between kernel calls + tilerModel.addConstraint(ASecondDimVar == parseDict['N']) + tilerModel.addConstraint(BFirstDimVar == parseDict['N']) + + tilerModel.addConstraint(AFirstDimVar == AFirstDimVar.Max(), strategy=PerformanceHint(1)) + tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy=PerformanceHint(1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['A', 'B', 'C', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + transA = operatorRepresentation['transA'] + transB = operatorRepresentation['transB'] + + varA = operatorRepresentation['A'] + varB = operatorRepresentation['B'] + + if transA == 0: + NSize = ctxt.lookup(varA).shape[-1] + else: + NSize = ctxt.lookup(varA).shape[-2] + + NOffset = 0 + + inputACubes = [] + inputBCubes = [] + inputAddCubes = [] + + replacements = {"M": [], "O": [], "batch": []} + + # Every output is constructed by a pair of inputs. Reconstruct this pair. 
+        for cube in outputCubes:
+
+            BSize = 1
+            BOffset = 0
+            BatchSize = 1
+            BatchOffset = 0
+
+            if len(cube.offset) == 2:
+                (MOffset, OOffset) = cube.offset
+                (MSize, OSize) = cube.dims
+            elif len(cube.offset) == 3:
+                (BatchOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, MSize, OSize) = cube.dims
+            else:
+                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, BSize, MSize, OSize) = cube.dims
+
+            replacements["M"].append(MSize)
+            replacements["O"].append(OSize)
+            replacements["batch"].append(BSize)
+
+            if transA == 0:
+                ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize))
+            else:
+                ACube = HyperRectangle((BatchOffset, BOffset, NOffset, MOffset), (BatchSize, BSize, NSize, MSize))
+
+            if transB == 0:
+                BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize))
+            else:
+                BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize))
+
+            CCube = HyperRectangle(cube.offset, cube.dims)
+
+            inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+            inputAddCubes.append(CCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(uint8_t)
+        }
+
+        for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
+            inputLoadSchedule.append({"A": a, "B": b, "C": c})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
new file mode 100644
index 0000000000..f25920f9d2
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
@@ -0,0 +1,198 @@
+# ----------------------------------------------------------------------
+#
+# File: MatmulTileConstraint.py
+#
+# Last edited: 28.04.2025
+#
+# Copyright (C) 2025, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
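# A worked sketch of the output-to-input cube reconstruction used above, and
# mirrored by the Matmul constraint below (illustrative numbers; follows the
# transA == transB == 0 branch): the reduction dimension N is never tiled, so
# every A/B tile keeps the full N extent.
def reconstruct_input_tiles(out_offset, out_dims, n_size):
    # (batch, B, M, O) output tile -> (batch, B, M, N) A tile, (batch, B, N, O) B tile
    batch_off, b_off, m_off, o_off = out_offset
    batch_sz, b_sz, m_sz, o_sz = out_dims
    a_cube = ((batch_off, b_off, m_off, 0), (batch_sz, b_sz, m_sz, n_size))
    b_cube = ((batch_off, b_off, 0, o_off), (batch_sz, b_sz, n_size, o_sz))
    return a_cube, b_cube

# A 16x12 output tile at offset (16, 24) of a 64x32 @ 32x48 matmul:
assert reconstruct_input_tiles((0, 0, 16, 24), (1, 1, 16, 12), 32) == \
    (((0, 0, 16, 0), (1, 1, 16, 32)), ((0, 0, 0, 24), (1, 1, 32, 12)))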
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import int8_t, uint16_t, uint32_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel, PerformanceHint +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleMatmulTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + # Add I/O dimensions to the model as variables + for _buffer in [bufferA, bufferB, outputBuffer]: + tilerModel.addTensorDimToModel(ctxt, _buffer.name) + + tensorsShapeLen = len(bufferA.shape) + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2)) + outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1)) + + # Map output dims to inputs dims + for idx in range(tensorsShapeLen - 2): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( + tensorName = bufferA.name, dimIdx = idx)) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( + tensorName = bufferB.name, dimIdx = idx)) + + tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) + tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) + + # Add GEMM Geometrical constraints + tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + + tensorsShapeLen = len(bufferA.shape) + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + + # VIC: We don't want to deal with intermediate results between kernel calls + tilerModel.addConstraint(ASecondDimVar == parseDict['N']) + tilerModel.addConstraint(BFirstDimVar == 
parseDict['N'])
+
+        # Hardware-specific constraints for the 4x12 accelerator array
+        M_full_size = ctxt.lookup(bufferA.name).shape[(tensorsShapeLen - 2) + parseDict['transA']]
+        if M_full_size >= 16:
+            tilerModel.addTileSizeDivisibleConstraint(parseDict,
+                                                      "M",
+                                                      AFirstDimVar,
+                                                      16,
+                                                      strategy = PerformanceHint(priority = 1))
+        else:
+            tilerModel.addConstraint(AFirstDimVar == AFirstDimVar.Max(), strategy = PerformanceHint(1))
+
+        O_full_size = ctxt.lookup(bufferB.name).shape[(tensorsShapeLen - 1) - parseDict['transB']]
+        if O_full_size >= 12:
+            tilerModel.addTileSizeDivisibleConstraint(parseDict,
+                                                      "O",
+                                                      BSecondDimVar,
+                                                      12,
+                                                      strategy = PerformanceHint(priority = 1))
+        else:
+            tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1))
+
+        return tilerModel
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['A', 'B', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        varA = operatorRepresentation['A']
+
+        NSize = ctxt.lookup(varA).shape[-1]
+        NOffset = 0
+
+        inputACubes = []
+        inputBCubes = []
+
+        replacements = {"M": [], "O": [], "batch": []}
+
+        # Every output is constructed by a pair of inputs. Reconstruct this pair.
+        for cube in outputCubes:
+
+            BSize = 1
+            BOffset = 0
+            BatchSize = 1
+            BatchOffset = 0
+
+            if len(cube.offset) == 2:
+                (MOffset, OOffset) = cube.offset
+                (MSize, OSize) = cube.dims
+            elif len(cube.offset) == 3:
+                (BatchOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, MSize, OSize) = cube.dims
+            else:
+                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, BSize, MSize, OSize) = cube.dims
+
+            replacements["M"].append(MSize)
+            replacements["O"].append(OSize)
+            replacements["batch"].append(BSize)
+
+            ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize))
+            BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize))
+
+            inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(int8_t)
+        }
+
+        for a, b in zip(inputACubes, inputBCubes):
+            inputLoadSchedule.append({"A": a, "B": b})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/__init__.py b/Deeploy/Targets/Redmule/TileConstraints/__init__.py
new file mode 100644
index 0000000000..a73187ca8f
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------
+#
+# File: __init__.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Redmule/Tiler.py b/Deeploy/Targets/Redmule/Tiler.py new file mode 100644 index 0000000000..d131b42d4e --- /dev/null +++ b/Deeploy/Targets/Redmule/Tiler.py @@ -0,0 +1,37 @@ +# ---------------------------------------------------------------------- +# +# File: Tiler.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.Targets.Redmule.Bindings import RedmuleMatmulBindings, RedmuleConv2DBindings, RedmuleGEMMBindings +from Deeploy.Targets.Redmule.TileConstraints.MatmulTileConstraint import RedmuleMatmulTileConstraint +from Deeploy.Targets.Redmule.TileConstraints.ConvTileConstraint import RedmuleConv2DTileConstraint +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings +from Deeploy.Targets.Redmule.TileConstraints.GEMMTileConstraint import RedmuleGEMMTileConstraint + +RedmuleMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleMatmulBindings, + tileConstraint = RedmuleMatmulTileConstraint()) +RedmuleConvTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleConv2DBindings, + tileConstraint = RedmuleConv2DTileConstraint()) +RedmuleGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleGEMMBindings, + tileConstraint = RedmuleGEMMTileConstraint()) \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py new file mode 100644 index 0000000000..31c4e17d05 --- /dev/null +++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py @@ -0,0 +1,176 @@ +# ---------------------------------------------------------------------- +# +# File: RedMulePasses.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2025, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +import numpy as np +import numpy.typing as npt +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match, NonBranchingMatcher +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import ( + _permuteLastTwoDims, + _appendTransposeNode, +) + + + +def _redmule_weight_layout_fun(graph: gs.Graph, match: Match, name: str): + """Convert Conv weights from [cout, h, w, cin] to [h,w,cin, cout] for RedMule accelerator""" + node = list(match.nodes_map.values())[0] + + weightTensor = node.inputs[1] + if isinstance(weightTensor, gs.Constant): + weightTensor.values = np.transpose(weightTensor.values, (1, 2, 3, 0)) + + return graph + + +@contextagnostic +class RedMuleAdjustWeightMemoryLayoutPass(ReplaceSequentialPatternPass): + """Pass to convert Conv weights from [cout, h, w, cin] to [hwcin, cout] for RedMule accelerator""" + + def __init__(self, redmuleEngineName: str): + graph = gs.Graph() + _input = gs.Variable(name='input_1') + output = graph.layer(inputs=[_input], outputs=['convOut'], op='Conv', name='conv') + graph.outputs.append(output) + graph.inputs.append(_input) + + super().__init__( + graph, + _redmule_weight_layout_fun, + "_REDMULE_ADJUST_WEIGHT_MEMORY_LAYOUT_PASS") + + +def _redmule_gemm_transpose_fun(graph: gs.Graph, match: Match, name: str): + """ + Handle GEMM transA and transB attributes for RedMule accelerator + + Properly handles tensors of any dimensionality, ensuring only the last two + dimensions are transposed when needed. 
+ """ + matched_nodes = [m for k, m in match.nodes_map.items()] + gemm_node = matched_nodes[0] + + if 'transA' not in gemm_node.attrs: + gemm_node.attrs['transA'] = 0 + if 'transB' not in gemm_node.attrs: + gemm_node.attrs['transB'] = 0 + if 'alpha' not in gemm_node.attrs: + gemm_node.attrs['alpha'] = 1.0 + if 'beta' not in gemm_node.attrs: + gemm_node.attrs['beta'] = 1.0 + + inputA = gemm_node.inputs[0] + inputB = gemm_node.inputs[1] + + + if gemm_node.attrs['transA'] != 0: + if isinstance(inputA, gs.Constant): + print(f"Physical transpose for constant A: {inputA.name}") + + if len(inputA.values.shape) > 2: + perm = list(range(len(inputA.values.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + inputA.values = np.transpose(inputA.values, perm) + else: + inputA.values = np.transpose(inputA.values) + + gemm_node.attrs['transA'] = 0 + else: + + perm = list(range(len(inputA.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + + anchorTransposeNode, anchorTransposeOutput = _appendTransposeNode( + inputA, + name + "_A_Transpose", + perm + ) + gemm_node.inputs[0] = anchorTransposeOutput + gemm_node.attrs['transA'] = 0 + graph.nodes.append(anchorTransposeNode) + + + if gemm_node.attrs['transB'] != 0: + if isinstance(inputB, gs.Constant): + + if len(inputB.values.shape) > 2: + + perm = list(range(len(inputB.values.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + inputB.values = np.transpose(inputB.values, perm) + else: + inputB.values = np.transpose(inputB.values) + + gemm_node.attrs['transB'] = 0 + else: + print(f"Adding transpose node for variable B: {inputB.name}") + + perm = list(range(len(inputB.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + anchorTransposeNode, anchorTransposeOutput = _appendTransposeNode( + inputB, + name + "_B_Transpose", + perm + ) + gemm_node.inputs[1] = anchorTransposeOutput + gemm_node.attrs['transB'] = 0 + graph.nodes.append(anchorTransposeNode) + + return graph + + +@contextagnostic +class RedMuleGEMMTransposePass(ReplaceSequentialPatternPass): + """Pass to handle GEMM transA and transB attributes for RedMule accelerator""" + + def __init__(self, redmuleEngineName: str): + + pattern = gs.Graph() + + input_a = gs.Variable(name="input_a") + input_b = gs.Variable(name="input_b") + + gemm_output = pattern.layer( + op="Gemm", + name="gemm_node", + inputs=[input_a, input_b], + outputs=["gemm_output"] + ) + + + pattern.inputs = [input_a, input_b] + pattern.outputs = [gemm_output] + + super().__init__( + pattern=pattern, + replacement_fn=_redmule_gemm_transpose_fun, + name="_REDMULE_GEMM_TRANSPOSE_PASS" + ) \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py new file mode 100644 index 0000000000..63063b6066 --- /dev/null +++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index 888e42ae5a..2bc2a29d57 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -37,7 +37,7 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) -Wno-pointer-sign ) - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) set(USE_NEUREKA ON) add_subdirectory(Platforms/Siracusa) elseif(platform STREQUAL PULPOpen) diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytest.c b/DeeployTest/Platforms/Siracusa/src/deeploytest.c index 5f04d78df3..643fb18928 100644 --- a/DeeployTest/Platforms/Siracusa/src/deeploytest.c +++ b/DeeployTest/Platforms/Siracusa/src/deeploytest.c @@ -68,7 +68,7 @@ float diff = expected_val - actual_val; - if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) + if ((diff < -2e-4) || (diff > 2e-4) || isnan(diff)) { local_err_count += 1; diff --git a/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/inputs.npz b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/inputs.npz new file mode 100644 index 0000000000..d47edbbed8 Binary files /dev/null and b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/inputs.npz differ diff --git a/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/network.onnx b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/network.onnx new file mode 100644 index 0000000000..70010413cc Binary files /dev/null and b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/network.onnx differ diff --git a/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/outputs.npz b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/outputs.npz new file mode 100644 index 0000000000..d756cdb275 Binary files /dev/null and b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/outputs.npz differ diff --git a/DeeployTest/Tests/testFloat2dConvLarge/inputs.npz b/DeeployTest/Tests/testFloat2dConvLarge/inputs.npz new file mode 100644 index 0000000000..5d15b68696 Binary files /dev/null and b/DeeployTest/Tests/testFloat2dConvLarge/inputs.npz differ diff --git a/DeeployTest/Tests/testFloat2dConvLarge/network.onnx b/DeeployTest/Tests/testFloat2dConvLarge/network.onnx new file mode 100644 index 0000000000..dd710f7ec0 Binary files /dev/null and b/DeeployTest/Tests/testFloat2dConvLarge/network.onnx differ diff --git a/DeeployTest/Tests/testFloat2dConvLarge/outputs.npz b/DeeployTest/Tests/testFloat2dConvLarge/outputs.npz new file mode 100644 index 0000000000..f1cf1fa8e9 Binary files /dev/null and b/DeeployTest/Tests/testFloat2dConvLarge/outputs.npz differ diff --git a/DeeployTest/Tests/testFloatMatmulLarge/inputs.npz b/DeeployTest/Tests/testFloatMatmulLarge/inputs.npz new file mode 100644 index 0000000000..06fe42968b Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge/inputs.npz differ diff --git a/DeeployTest/Tests/testFloatMatmulLarge/network.onnx b/DeeployTest/Tests/testFloatMatmulLarge/network.onnx new file mode 100644 index 0000000000..d91cbeeacc Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge/network.onnx differ diff --git a/DeeployTest/Tests/testFloatMatmulLarge/outputs.npz 
b/DeeployTest/Tests/testFloatMatmulLarge/outputs.npz
new file mode 100644
index 0000000000..edd6182cd9
Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge/outputs.npz differ
diff --git a/DeeployTest/Tests/testFloatMatmulLarge256/inputs.npz b/DeeployTest/Tests/testFloatMatmulLarge256/inputs.npz
new file mode 100644
index 0000000000..43eb2325f9
Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge256/inputs.npz differ
diff --git a/DeeployTest/Tests/testFloatMatmulLarge256/network.onnx b/DeeployTest/Tests/testFloatMatmulLarge256/network.onnx
new file mode 100644
index 0000000000..cd3d3c1474
Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge256/network.onnx differ
diff --git a/DeeployTest/Tests/testFloatMatmulLarge256/outputs.npz b/DeeployTest/Tests/testFloatMatmulLarge256/outputs.npz
new file mode 100644
index 0000000000..fe0be0bad6
Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge256/outputs.npz differ
diff --git a/DeeployTest/testRunner_tiled_siracusa_w_redmule.py b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py
new file mode 100644
index 0000000000..015ca62085
--- /dev/null
+++ b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py
@@ -0,0 +1,50 @@
+# ----------------------------------------------------------------------
+#
+# File: testRunner_tiled_siracusa_w_redmule.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
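# The deeploytest.c change above widens the float comparison bound; a host-side
# equivalent of the accept condition (a minimal sketch, not part of the test
# harness):
import math

def output_matches(expected: float, actual: float, tol: float = 2e-4) -> bool:
    diff = expected - actual
    return not (diff < -tol or diff > tol or math.isnan(diff))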
+ +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = True, + description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & Redmule).") + + parser.add_argument('--cores', + metavar = '', + dest = 'cores', + type = int, + default = 1, + help = 'Set number of cluster cores') + args = parser.parse_args() + + testRunner = TestRunner(platform = "Siracusa_w_redmule", + simulator = "gvsoc", + tiling = True, + argument_parser = parser) + + testRunner.cmake_args += f" -D NUM_CORES={args.cores}" + + + testRunner.run() diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 4e24995d78..4b7f845ce1 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -39,13 +39,15 @@ from Deeploy.Targets.Neureka.Deployer import NeurekaDeployer from Deeploy.Targets.Neureka.Platform import MemoryNeurekaPlatform, MemoryNeurekaPlatformWrapper, NeurekaOptimizer, \ NeurekaPlatform +from Deeploy.Targets.Redmule.Deployer import RedmuleDeployer +from Deeploy.Targets.Redmule.Platform import RedmuleOptimizer, RedmulePlatform from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform from Deeploy.Targets.Snitch.Deployer import SnitchDeployer from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Siracusa_w_redmule"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -77,6 +79,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Siracusa_w_neureka": Platform = NeurekaPlatform() + + elif platformName == "Siracusa_w_redmule": + Platform = RedmulePlatform() elif platformName == "Snitch": Platform = SnitchPlatform() @@ -89,7 +94,7 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: def setupMemoryPlatform(platform: DeploymentPlatform, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel) -> Union[MemoryPlatform, MemoryPlatformWrapper]: - if isinstance(platform, PULPPlatform): + if isinstance(platform, (PULPPlatform, RedmulePlatform)): return MemoryPULPPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel) elif isinstance(platform, NeurekaPlatform): weightMemoryLevel = memoryHierarchy.memoryLevels["WeightMemory_SRAM"] \ @@ -191,6 +196,22 @@ def mapDeployer(platform: DeploymentPlatform, name = name, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + + elif isinstance(platform, (RedmulePlatform)): + if loweringOptimizer is None: + loweringOptimizer = RedmuleOptimizer + + if default_channels_first is None: + default_channels_first = False + + deployer = RedmuleDeployer(graph, + platform, + inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir) elif isinstance(platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): diff --git a/Makefile b/Makefile index 806daa274f..35e0d7febb 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,7 @@ PICOLIBC_RV32IM_INSTALL_DIR ?= 
${LLVM_INSTALL_DIR}/picolibc/riscv/rv32im PICOLIBC_RV32IMC_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imc PICOLIBC_RV32IMA_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32ima PICOLIBC_RV32IMAFD_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imafd +PICOLIBC_RV32IMF_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imf PULP_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/pulp-sdk QEMU_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/qemu @@ -64,7 +65,7 @@ PULP_SDK_COMMIT_HASH ?= 3e1e569bd789a11d9dde6d6b3930849505e68b4a BANSHEE_COMMIT_HASH ?= 0e105921e77796e83d01c2aa4f4cadfa2005b4d9 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2 -GVSOC_COMMIT_HASH ?= eeb7ef8c1dfcb944ac80d797a8cea35aacc14ac5 +GVSOC_COMMIT_HASH ?= 35d00d15d7249daaac0de61bd8485fba128e5959 MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d XTL_VERSION ?= 0.7.5 XSIMD_VERSION ?= 13.2.0 @@ -352,7 +353,18 @@ ${PICOLIBC_RV32IMAFD_INSTALL_DIR}: ${TOOLCHAIN_DIR}/picolibc --cross-file ../scripts/meson-build-script-rv32imafd.txt && \ PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson install -picolibc-riscv: ${PICOLIBC_RV32IM_INSTALL_DIR} ${PICOLIBC_RV32IMA_INSTALL_DIR} ${PICOLIBC_RV32IMC_INSTALL_DIR} ${PICOLIBC_RV32IMAFD_INSTALL_DIR} +${PICOLIBC_RV32IMF_INSTALL_DIR}: ${TOOLCHAIN_DIR}/picolibc + cd ${TOOLCHAIN_DIR}/picolibc && mkdir -p build-rv32imf && cd build-rv32imf && \ + cp ${TOOLCHAIN_DIR}/meson-build-script-rv32imf.txt ../scripts && \ + PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson setup --reconfigure -Dincludedir=include \ + -Dlibdir=lib \ + -Dspecsdir=none \ + -Dmultilib=false \ + --prefix ${PICOLIBC_RV32IMF_INSTALL_DIR} \ + --cross-file ../scripts/meson-build-script-rv32imf.txt && \ + PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson install + +picolibc-riscv: ${PICOLIBC_RV32IM_INSTALL_DIR} ${PICOLIBC_RV32IMA_INSTALL_DIR} ${PICOLIBC_RV32IMC_INSTALL_DIR} ${PICOLIBC_RV32IMAFD_INSTALL_DIR} ${PICOLIBC_RV32IMF_INSTALL_DIR} ${TOOLCHAIN_DIR}/pulp-sdk: cd ${TOOLCHAIN_DIR} && \ @@ -390,7 +402,7 @@ snitch_runtime: ${SNITCH_INSTALL_DIR} ${TOOLCHAIN_DIR}/gvsoc: cd ${TOOLCHAIN_DIR} && \ - git clone https://github.com/gvsoc/gvsoc.git && \ + git clone https://github.com/runwangdl/gvsoc.git && \ cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \ git submodule update --init --recursive && \ pip install -r core/requirements.txt && pip install -r gapy/requirements.txt diff --git a/TargetLibraries/Generic/inc/kernel/GELU.h b/TargetLibraries/Generic/inc/kernel/GELU.h index 0c6d19d6c7..0825a11e0e 100644 --- a/TargetLibraries/Generic/inc/kernel/GELU.h +++ b/TargetLibraries/Generic/inc/kernel/GELU.h @@ -46,4 +46,9 @@ void GELU_s8_s32(int8_t *data_in, int32_t *data_out, int32_t dataSize, int8_t b, void GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize); +void GELU_fp32_fp32_sigmoid(float32_t *data_in, float32_t *data_out, int32_t dataSize); + +void GELU_fp32_fp32_sigmoid_chunk(float32_t *data_in, float32_t *data_out, + int32_t start_idx, int32_t end_idx); + #endif //__DEEPLOY_BASIC_MATH_GELU_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/MatMul.h b/TargetLibraries/Generic/inc/kernel/MatMul.h index d9b35eb1a7..d646950683 100644 --- a/TargetLibraries/Generic/inc/kernel/MatMul.h +++ b/TargetLibraries/Generic/inc/kernel/MatMul.h @@ -62,10 +62,20 @@ void MatMul_s8_s8_s32(int8_t const *__restrict__ pSrcA, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t C_offset); 
+/******************************************************************************/
+/* Matrix Multiplication (Float32)                                            */
+/******************************************************************************/

 void MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
-                           const float32_t *__restrict__ pSrcB,
-                           float32_t *__restrict__ pDstY,
-                           uint32_t M,
-                           uint32_t N,
-                           uint32_t O);
+                           const float32_t *__restrict__ pSrcB,
+                           float32_t *__restrict__ pDstY,
+                           uint32_t M,
+                           uint32_t N,
+                           uint32_t O);
+
+void MatMul_fp32_fp32_fp32_unroll1x7(const float32_t *__restrict__ pSrcA,
+                                     const float32_t *__restrict__ pSrcB,
+                                     float32_t *__restrict__ pDstY,
+                                     uint32_t M,
+                                     uint32_t N,
+                                     uint32_t O);

 #endif //__DEEPLOY_BASIC_MATH_MATMUL_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/src/GELU_fp32.c b/TargetLibraries/Generic/src/GELU_fp32.c
index 923dcf9c65..18e5e0a41a 100644
--- a/TargetLibraries/Generic/src/GELU_fp32.c
+++ b/TargetLibraries/Generic/src/GELU_fp32.c
@@ -36,3 +36,28 @@ void GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize) {
     data_out[i] = x * cdf;
   }
 }
+
+void GELU_fp32_fp32_sigmoid(float32_t *data_in, float32_t *data_out, int32_t dataSize) {
+
+  const float32_t scale = 1.702f;
+  for (int i = 0; i < dataSize; i++) {
+    float32_t x = data_in[i];
+    float32_t sigmoid_in = scale * x;
+    // sigmoid(z) = 1 / (1 + exp(-z))
+    float32_t sigmoid = 1.0f / (1.0f + expf(-sigmoid_in));
+    data_out[i] = x * sigmoid;
+  }
+}
+
+void GELU_fp32_fp32_sigmoid_chunk(float32_t *data_in, float32_t *data_out, int32_t start_idx, int32_t end_idx)
+{
+  const float32_t scale = 1.702f;
+  for (int32_t i = start_idx; i < end_idx; i++)
+  {
+    float32_t x = data_in[i];
+    float32_t sigmoid_in = scale * x;
+    // sigmoid(z) = 1 / (1 + exp(-z))
+    float32_t sigmoid = 1.0f / (1.0f + expf(-sigmoid_in));
+    data_out[i] = x * sigmoid;
+  }
+}
\ No newline at end of file
diff --git a/TargetLibraries/Generic/src/MatMul_fp32.c b/TargetLibraries/Generic/src/MatMul_fp32.c
index 1d704b8517..3ef26a6054 100644
--- a/TargetLibraries/Generic/src/MatMul_fp32.c
+++ b/TargetLibraries/Generic/src/MatMul_fp32.c
@@ -29,20 +29,93 @@
 #include "DeeployBasicMath.h"

 void MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
-                           const float32_t *__restrict__ pSrcB,
-                           float32_t *__restrict__ pDstY,
-                           uint32_t M,
-                           uint32_t N,
-                           uint32_t O) {
-
-
-  for (uint32_t i = 0; i < M; ++i) {
-    for (uint32_t j = 0; j < O; ++j) {
+                           const float32_t *__restrict__ pSrcB,
+                           float32_t *__restrict__ pDstY,
+                           uint32_t M,
+                           uint32_t N,
+                           uint32_t O)
+{
+
+  for (uint32_t i = 0; i < M; ++i)
+  {
+    for (uint32_t j = 0; j < O; ++j)
+    {
       float32_t sum = 0.0f;
-      for (uint32_t k = 0; k < N; ++k) {
+      for (uint32_t k = 0; k < N; ++k)
+      {
         sum += pSrcA[i * N + k] * pSrcB[k * O + j];
       }
       pDstY[i * O + j] = sum;
     }
   }
-}
\ No newline at end of file
+}
+
+void MatMul_fp32_fp32_fp32_unroll1x7(const float32_t *__restrict__ pSrcA,
+                                     const float32_t *__restrict__ pSrcB,
+                                     float32_t *__restrict__ pDstY,
+                                     uint32_t M,
+                                     uint32_t N,
+                                     uint32_t O)
+{
+  uint32_t i, j, k;
+  uint32_t O_block = O - (O % 7);
+
+  for (i = 0; i < M; i++)
+  {
+    // Main loop: compute seven output columns per iteration so the single
+    // load of pSrcA[i * N + k] is reused across seven accumulators.
+    for (j = 0; j < O_block; j += 7)
+    {
+      float32_t sum0 = 0.0f;
+      float32_t sum1 = 0.0f;
+      float32_t sum2 = 0.0f;
+      float32_t sum3 = 0.0f;
+      float32_t sum4 = 0.0f;
+      float32_t sum5 = 0.0f;
+      float32_t sum6 = 0.0f;
+
+      for (k = 0; k < N; k++)
+      {
+        float32_t a0 = pSrcA[i * N + k];
+
+        float32_t b0 = pSrcB[k * O + (j + 0)];
+        float32_t b1 = pSrcB[k * O + (j + 1)];
+        float32_t b2 = pSrcB[k * O + (j + 2)];
+        float32_t b3 = pSrcB[k * O + (j + 3)];
+        float32_t b4 = pSrcB[k * O + (j + 4)];
+        float32_t b5 = pSrcB[k * O + (j + 5)];
+        float32_t b6 = pSrcB[k * O + (j + 6)];
+
+        sum0 += a0 * b0;
+        sum1 += a0 * b1;
+        sum2 += a0 * b2;
+        sum3 += a0 * b3;
+        sum4 += a0 * b4;
+        sum5 += a0 * b5;
+        sum6 += a0 * b6;
+      }
+
+      pDstY[i * O + (j + 0)] = sum0;
+      pDstY[i * O + (j + 1)] = sum1;
+      pDstY[i * O + (j + 2)] = sum2;
+      pDstY[i * O + (j + 3)] = sum3;
+      pDstY[i * O + (j + 4)] = sum4;
+      pDstY[i * O + (j + 5)] = sum5;
+      pDstY[i * O + (j + 6)] = sum6;
+    }
+
+    // Remainder loop for the last O % 7 columns.
+    for (j = O_block; j < O; j++)
+    {
+      float32_t sum = 0.0f;
+
+      for (k = 0; k < N; k++)
+      {
+        float32_t a_val = pSrcA[i * N + k];
+        float32_t b_val = pSrcB[k * O + j];
+        float32_t prod = a_val * b_val;
+        sum += prod;
+      }
+
+      pDstY[i * O + j] = sum;
+    }
+  }
+}
+
diff --git a/TargetLibraries/Generic/src/Softmax_fp32.c b/TargetLibraries/Generic/src/Softmax_fp32.c
index 5553f1e302..e9082c3333 100644
--- a/TargetLibraries/Generic/src/Softmax_fp32.c
+++ b/TargetLibraries/Generic/src/Softmax_fp32.c
@@ -41,12 +41,14 @@ void Softmax_fp32_fp32(float32_t* input, float32_t* output, int32_t size, int32_
     }

     for (int i = 0; i < last_dim_length; i++) {
-      output[b * last_dim_length + i] = expf(input[b * last_dim_length + i] - max_val);
+      float32_t exp_val = input[b * last_dim_length + i] - max_val;
+      output[b * last_dim_length + i] = expf(exp_val);
       sum += output[b * last_dim_length + i];
     }

+    float32_t sum_1 = 1.0f / sum;
     for (int i = 0; i < last_dim_length; i++) {
-      output[b * last_dim_length + i] /= sum;
+      output[b * last_dim_length + i] = output[b * last_dim_length + i] * sum_1;
     }
   }
 }
diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt
index d951c6bde4..7508962fb1 100644
--- a/TargetLibraries/PULPOpen/CMakeLists.txt
+++ b/TargetLibraries/PULPOpen/CMakeLists.txt
@@ -2,7 +2,7 @@ file(GLOB_RECURSE SOURCES
   "src/**"
 )

-if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka")
+if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka" OR platform STREQUAL "Siracusa_w_redmule")
   include(cmake/pulp-sdk-siracusa.cmake)
 elseif(platform STREQUAL "PULPOpen")
   include(cmake/pulp-sdk-pulp-open.cmake)
diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h
new file mode 100644
index 0000000000..51c5097744
--- /dev/null
+++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h
@@ -0,0 +1,75 @@
+
+/* =====================================================================
+ * Title: Conv.h
+ * Description:
+ *
+ * $Date: 05.04.2025
+ *
+ * ===================================================================== */
+/*
+ * Copyright (C) 2020 ETH Zurich and University of Bologna.
+ *
+ * Author: Run Wang, ETH Zurich
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #include "DeeployPULPMath.h"
+
+ void Conv2d_ChannelRange_fp32_fp32_fp32_HWC(
+     const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C,
+     const float32_t *__restrict__ pSrcB, uint32_t F_subset,
+     uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ,
+     float32_t *__restrict__ pDstC, uint32_t F_total, uint32_t F_start,
+     uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right);
+
+
+ void Conv2d_Im2Col_ChannelRange_fp32_fp32_fp32_HWC(
+     const float32_t *__restrict__ pSrcA,
+     uint32_t H,
+     uint32_t W,
+     uint32_t C,
+     const float32_t *__restrict__ pSrcB,
+     uint32_t F_subset,
+     uint32_t P,
+     uint32_t Q,
+     uint32_t SP,
+     uint32_t SQ,
+     float32_t *__restrict__ pDstC,
+     uint32_t F_total,
+     uint32_t F_start,
+     uint32_t pad_top,
+     uint32_t pad_bottom,
+     uint32_t pad_left,
+     uint32_t pad_right,
+     float32_t *__restrict__ pIm2ColBuffer);
+
+ void Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule(
+     const float32_t *__restrict__ pSrcA,
+     uint32_t H,
+     uint32_t W,
+     uint32_t C,
+     const float32_t *__restrict__ pSrcB,
+     uint32_t P,
+     uint32_t Q,
+     uint32_t SP,
+     uint32_t SQ,
+     float32_t *__restrict__ pDstC,
+     uint32_t F,
+     uint32_t pad_top,
+     uint32_t pad_bottom,
+     uint32_t pad_left,
+     uint32_t pad_right,
+     float32_t *__restrict__ pIm2ColBuffer);
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/inc/kernel/gelu.h b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h
similarity index 54%
rename from TargetLibraries/PULPOpen/inc/kernel/gelu.h
rename to TargetLibraries/PULPOpen/inc/kernel/MaxPool.h
index 390e7f9926..6d9dd6ea86 100644
--- a/TargetLibraries/PULPOpen/inc/kernel/gelu.h
+++ b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h
@@ -1,15 +1,14 @@
-
 /* =====================================================================
- * Title: gelu.h
+ * Title: MaxPool.h
  * Description:
  *
- * $Date: 28.01.2025
+ * $Date: 05.04.2025
  *
  * ===================================================================== */
 /*
  * Copyright (C) 2020 ETH Zurich and University of Bologna.
  *
- * Author: Moritz Scherer, ETH Zurich
+ * Author: Run Wang, ETH Zurich
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -28,4 +27,18 @@

 #include "DeeployPULPMath.h"

-void GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize);
\ No newline at end of file
+
+void MaxPool2d_fp32_fp32_HWC(
+    const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C,
+    uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ,
+    float32_t *__restrict__ pDstC,
+    uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right);
+
+
+void MaxPool2d_ChannelRange_fp32_fp32_HWC(
+    const float32_t *__restrict__ pSrcA,
+    uint32_t W, uint32_t H, uint32_t C,
+    uint32_t Q, uint32_t P, uint32_t SQ, uint32_t SP,
+    float32_t *__restrict__ pDstC,
+    uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right,
+    uint32_t ch_start, uint32_t ch_count);
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/inc/kernel/gemv.h b/TargetLibraries/PULPOpen/inc/kernel/gemv.h
index 214f8300ad..057e4a7a4d 100644
--- a/TargetLibraries/PULPOpen/inc/kernel/gemv.h
+++ b/TargetLibraries/PULPOpen/inc/kernel/gemv.h
@@ -26,9 +26,19 @@
  */

 #include "stdint.h"
+#include "DeeployPULPMath.h"

 void gemv_s8_s8_plp(int8_t *pIn, int8_t *pBias, int8_t *pOut, int8_t *pWeight,
                     int32_t *pKappa, int32_t *pLambda, uint16_t out_mult,
                     uint16_t out_shift, uint16_t dim_vec, uint16_t num_o_neurons,
                     uint8_t flag_relu, uint8_t flag_batch_norm);
+
+void Gemm_fp32_fp32_fp32_fp32_Redmule(
+    const float32_t *__restrict__ pSrcA,
+    const float32_t *__restrict__ pSrcB,
+    const float32_t *__restrict__ pBias,
+    float32_t *__restrict__ pDstY,
+    uint32_t M,
+    uint32_t N,
+    uint32_t O);
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/Convolution_fp32.c b/TargetLibraries/PULPOpen/src/Convolution_fp32.c
index ab804b43a9..88a8706f49 100644
--- a/TargetLibraries/PULPOpen/src/Convolution_fp32.c
+++ b/TargetLibraries/PULPOpen/src/Convolution_fp32.c
@@ -30,11 +30,11 @@
 #include "DeeployPULPMath.h"
 #include "pmsis.h"

-void Conv2d_fp32_fp32_fp32_HWC(
+void Conv2d_ChannelRange_fp32_fp32_fp32_HWC(
     const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C,
-    const float32_t *__restrict__ pSrcB, uint32_t F,
+    const float32_t *__restrict__ pSrcB, uint32_t F_subset,
     uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ,
-    float32_t *__restrict__ pDstC,
+    float32_t *__restrict__ pDstC, uint32_t F_total, uint32_t F_start,
     uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right) {

@@ -48,7 +48,7 @@ void Conv2d_fp32_fp32_fp32_HWC(

   for (h = 0; h < H_out; ++h) {
     for (w = 0; w < W_out; ++w) {
-      for (f = 0; f < F; ++f) {
+      for (f = 0; f < F_subset; ++f) {
         float32_t sum = 0.0f;

         for (p = 0; p < P; ++p) {
@@ -73,9 +73,182 @@ void Conv2d_fp32_fp32_fp32_HWC(
           }
         }

-        uint32_t output_idx = (h * W_out + w) * F + f;
+        uint32_t output_idx = (h * W_out + w) * F_total + (F_start + f);
         pDstC[output_idx] = sum;
       }
     }
   }
+}
+
+void Conv2d_Im2Col_ChannelRange_fp32_fp32_fp32_HWC(
+    const float32_t *__restrict__ pSrcA,
+    uint32_t H,
+    uint32_t W,
+    uint32_t C,
+    const float32_t *__restrict__ pSrcB,
+    uint32_t F_subset,
+    uint32_t P,
+    uint32_t Q,
+    uint32_t SP,
+    uint32_t SQ,
+    float32_t *__restrict__ pDstC,
+    uint32_t F_total,
+    uint32_t F_start,
+    uint32_t pad_top,
+    uint32_t pad_bottom,
+    uint32_t pad_left,
+    uint32_t pad_right,
+    float32_t *__restrict__ pIm2ColBuffer)
+{
+
+  uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1;
+  uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1;
+
+  uint32_t kernel_size = P * Q * C;
+
+  for (uint32_t h_out = 0; h_out < H_out; h_out++)
+  {
+    for (uint32_t w_out = 0; w_out < W_out; w_out++)
+    {
+
+      int32_t h_in_start = h_out * SP - pad_top;
+      int32_t w_in_start = w_out * SQ - pad_left;
+
+      float32_t *pIm2Col = pIm2ColBuffer;
+
+      // Gather the receptive field of this output position into the im2col
+      // buffer, zero-filling out-of-bounds (padded) taps.
+      for (uint32_t p = 0; p < P; p++)
+      {
+        int32_t h_in = h_in_start + p;
+
+        for (uint32_t q = 0; q < Q; q++)
+        {
+          int32_t w_in = w_in_start + q;
+
+          for (uint32_t c = 0; c < C; c++)
+          {
+            if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W)
+            {
+              uint32_t in_idx = (h_in * W + w_in) * C + c;
+              pIm2Col[p * Q * C + q * C + c] = pSrcA[in_idx];
+            }
+            else
+            {
+              pIm2Col[p * Q * C + q * C + c] = 0.0f;
+            }
+          }
+        }
+      }
+
+      // One dot product per output channel in this core's channel range.
+      for (uint32_t f = 0; f < F_subset; f++)
+      {
+        float32_t sum = 0.0f;
+
+        const float32_t *weight_ptr = pSrcB + f * kernel_size;
+
+        for (uint32_t k = 0; k < kernel_size; k++)
+        {
+          sum += pIm2Col[k] * weight_ptr[k];
+        }
+
+        uint32_t out_idx = (h_out * W_out + w_out) * F_total + (F_start + f);
+        pDstC[out_idx] = sum;
+      }
+    }
+  }
+}
+
+void Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule(
+    const float32_t *__restrict__ pSrcA,
+    uint32_t H,
+    uint32_t W,
+    uint32_t C,
+    const float32_t *__restrict__ pSrcB,
+    uint32_t P,
+    uint32_t Q,
+    uint32_t SP,
+    uint32_t SQ,
+    float32_t *__restrict__ pDstC,
+    uint32_t F,
+    uint32_t pad_top,
+    uint32_t pad_bottom,
+    uint32_t pad_left,
+    uint32_t pad_right,
+    float32_t *__restrict__ pIm2ColBuffer) {
+
+  uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1;
+  uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1;
+  uint32_t kernel_size = P * Q * C;
+  uint32_t core_id = pi_core_id();
+  uint32_t num_cores = NUM_CORES;
+
+  uint32_t total_positions = H_out * W_out;
+  uint32_t num_batches = (total_positions + num_cores - 1) / num_cores;
+
+  float32_t *core_im2col_buffer = pIm2ColBuffer + core_id * kernel_size;
+
+  for (uint32_t batch = 0; batch < num_batches; batch++) {
+
+    uint32_t batch_start_pos = batch * num_cores;
+
+    uint32_t valid_cores = MIN(num_cores, total_positions - batch_start_pos);
+
+    // Each core gathers the im2col patch for one output position of this
+    // batch into its private slice of the shared buffer.
+    if (core_id < valid_cores) {
+
+      uint32_t pos = batch_start_pos + core_id;
+
+      uint32_t h_out = pos / W_out;
+      uint32_t w_out = pos % W_out;
+      int32_t h_in_start = h_out * SP - pad_top;
+      int32_t w_in_start = w_out * SQ - pad_left;
+
+      for (uint32_t p = 0; p < P; p++) {
+        int32_t h_in = h_in_start + p;
+
+        for (uint32_t q = 0; q < Q; q++) {
+          int32_t w_in = w_in_start + q;
+          uint32_t in_offset = (h_in * W + w_in) * C;
+          uint32_t kernel_offset = (p * Q + q) * C;
+
+          if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W) {
+
+            for (uint32_t c = 0; c < C; c++) {
+              core_im2col_buffer[kernel_offset + c] = pSrcA[in_offset + c];
+            }
+          }
+          else {
+
+            for (uint32_t c = 0; c < C; c++) {
+              core_im2col_buffer[kernel_offset + c] = 0.0f;
+            }
+          }
+        }
+      }
+    }
+
+    pi_cl_team_barrier();
+
+    // Core 0 offloads the gathered batch as a single [valid_cores x
+    // kernel_size] x [kernel_size x F] matmul to RedMule.
+    if (core_id == 0) {
+
+      float32_t *batch_output = pDstC + batch_start_pos * F;
+
+      MatMul_fp32_fp32_fp32_Redmule(
+          pIm2ColBuffer,
+          pSrcB,
+          batch_output,
+          valid_cores,
+          kernel_size,
+          F
+      );
+    }
+
+    pi_cl_team_barrier();
+  }
 }
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c
new file mode 100644
index 0000000000..e042cea132
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c
@@ -0,0 +1,139 @@
+#include "DeeployBasicMath.h"
+
+
+#define REDMULE_BASE_ADDR 0x10201C00
+
+#define REG_MNK_M 0x00
+#define REG_MNK_N 0x04
+#define REG_MNK_K 0x08
+#define REG_X_ADDR 0x0C
+#define REG_Y_ADDR 0x10
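+// Register map note (added for clarity): X and W take the two matmul
+// operands, Y points to the accumulation input, and Z (below) to the output;
+// Y and Z may alias, as in the FP32 matmul further down. The offsets follow
+// this fork's gvsoc RedMule model and should be re-checked against other
+// RedMule integrations before reuse.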
+#define REG_Z_ADDR 0x14
+#define REG_W_ADDR 0x18
+#define REG_COMPUTE_MODE 0x1C
+#define REG_TRIGGER 0x20
+#define REG_WAIT 0x28
+
+void MatMul_fp32_fp32_fp32_Redmule(
+    const float32_t *__restrict__ pSrcA,
+    const float32_t *__restrict__ pSrcB,
+    float32_t *__restrict__ pDstY,
+    uint32_t M,
+    uint32_t N,
+    uint32_t O) {
+
+  // Z accumulates onto Y, and both point at pDstY here, so clear it first.
+  uint32_t total_elements = M * O;
+  for (uint32_t i = 0; i < total_elements; i++) {
+    pDstY[i] = 0.0f;
+  }
+
+  volatile uint16_t *mnk_m = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M);
+  volatile uint16_t *mnk_n = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N);
+  volatile uint16_t *mnk_k = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K);
+
+  *mnk_m = (uint16_t)M;
+  *mnk_n = (uint16_t)N;
+  *mnk_k = (uint16_t)O;
+
+  volatile uint32_t *x_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR);
+  volatile uint32_t *y_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR);
+  volatile uint32_t *z_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR);
+  volatile uint32_t *w_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR);
+
+  *x_addr = (uint32_t)((uintptr_t)pSrcA);
+  *y_addr = (uint32_t)((uintptr_t)pDstY);
+  *z_addr = (uint32_t)((uintptr_t)pDstY);
+  *w_addr = (uint32_t)((uintptr_t)pSrcB);
+
+  volatile uint32_t *compute_mode = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE);
+  *compute_mode = 4; // FP32 mode
+
+  volatile uint32_t *trigger = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER);
+  *trigger; // Reading TRIGGER starts the computation
+
+  volatile uint32_t *wait_reg = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT);
+  (void)*wait_reg; // Blocking read: returns once the accelerator has finished
+}
+
+void MatMul_fp32_fp32_fp32_Redmule_Async(
+    const float32_t *__restrict__ pSrcA,
+    const float32_t *__restrict__ pSrcB,
+    float32_t *__restrict__ pDstY,
+    uint32_t M,
+    uint32_t N,
+    uint32_t O) {
+
+  uint32_t total_elements = M * O;
+  for (uint32_t i = 0; i < total_elements; i++) {
+    pDstY[i] = 0.0f;
+  }
+
+  volatile uint16_t *mnk_m = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M);
+  volatile uint16_t *mnk_n = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N);
+  volatile uint16_t *mnk_k = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K);
+
+  *mnk_m = (uint16_t)M;
+  *mnk_n = (uint16_t)N;
+  *mnk_k = (uint16_t)O;
+
+  volatile uint32_t *x_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR);
+  volatile uint32_t *y_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR);
+  volatile uint32_t *z_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR);
+  volatile uint32_t *w_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR);
+
+  *x_addr = (uint32_t)((uintptr_t)pSrcA);
+  *y_addr = (uint32_t)((uintptr_t)pDstY);
+  *z_addr = (uint32_t)((uintptr_t)pDstY);
+  *w_addr = (uint32_t)((uintptr_t)pSrcB);
+
+  volatile uint32_t *compute_mode = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE);
+  *compute_mode = 4; // FP32 mode
+
+  volatile uint32_t *trigger = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER);
+  *trigger; // Trigger without waiting
+}
+
+uint32_t MatMul_fp32_fp32_fp32_Redmule_Wait() {
+  volatile uint32_t *wait_reg = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT);
+  return *wait_reg;
+}
+
+void Gemm_fp32_fp32_fp32_fp32_Redmule(
+    const float32_t *__restrict__ pSrcA,
+    const float32_t *__restrict__ pSrcB,
+    const float32_t *__restrict__ pBias,
+    float32_t *__restrict__ pDstY,
+    uint32_t M,
+    uint32_t N,
+    uint32_t O) {
+
+  // Y carries the bias, so no zero-initialisation of the output is needed.
+  volatile uint16_t *mnk_m = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M);
+  volatile uint16_t *mnk_n = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N);
+  volatile uint16_t *mnk_k = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K);
+
+  *mnk_m = (uint16_t)M;
+  *mnk_n = (uint16_t)N;
+  *mnk_k = (uint16_t)O;
+
+  volatile uint32_t *x_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR);
+  volatile uint32_t *y_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR);
+  volatile uint32_t *z_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR);
+  volatile uint32_t *w_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR);
+
+  *x_addr = (uint32_t)((uintptr_t)pSrcA);
+  *y_addr = (uint32_t)((uintptr_t)pBias);
+  *z_addr = (uint32_t)((uintptr_t)pDstY);
+  *w_addr = (uint32_t)((uintptr_t)pSrcB);
+
+  volatile uint32_t *compute_mode = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE);
+  *compute_mode = 4; // FP32 mode
+
+  volatile uint32_t *trigger = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER);
+  *trigger; // Reading TRIGGER starts the computation
+
+  volatile uint32_t *wait_reg = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT);
+  (void)*wait_reg; // Blocking read: returns once the accelerator has finished
+}
diff --git a/TargetLibraries/PULPOpen/src/MaxPool_fp32.c b/TargetLibraries/PULPOpen/src/MaxPool_fp32.c
index fe8afcde12..05f86674b0 100644
--- a/TargetLibraries/PULPOpen/src/MaxPool_fp32.c
+++ b/TargetLibraries/PULPOpen/src/MaxPool_fp32.c
@@ -72,4 +72,61 @@ void MaxPool2d_fp32_fp32_HWC(
       }
     }
   }
+}
+
+
+void MaxPool2d_ChannelRange_fp32_fp32_HWC(
+    const float32_t *__restrict__ pSrcA,
+    uint32_t W, uint32_t H, uint32_t C,
+    uint32_t Q, uint32_t P, uint32_t SQ, uint32_t SP,
+    float32_t *__restrict__ pDstC,
+    uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right,
+    uint32_t ch_start, uint32_t ch_count) {
+
+  uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1;
+  uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1;
+
+  uint32_t ch_end = ch_start + ch_count;
+
+  if (ch_end > C) {
+    ch_end = C;
+  }
+
+  for (uint32_t h_out = 0; h_out < H_out; ++h_out) {
+    for (uint32_t w_out = 0; w_out < W_out; ++w_out) {
+
+      for (uint32_t c = ch_start; c < ch_end; ++c) {
+        float32_t max_val = -inf;
+
+        int32_t h_in_start = h_out * SP - pad_top;
+        int32_t w_in_start = w_out * SQ - pad_left;
+
+        for (uint32_t p = 0; p < P; ++p) {
+          int32_t h_in = h_in_start + p;
+
+          if (h_in < 0 || h_in >= H) {
+            continue;
+          }
+
+          for (uint32_t q = 0; q < Q; ++q) {
+            int32_t w_in = w_in_start + q;
+
+            if (w_in < 0 || w_in >= W) {
+              continue;
+            }
+
+            uint32_t input_idx = (h_in * W + w_in) * C + c;
+            float32_t val = pSrcA[input_idx];
+
+            if (val > max_val) {
+              max_val = val;
+            }
+          }
+        }
+
+        uint32_t output_idx = (h_out * W_out + w_out) * C + c;
+        pDstC[output_idx] = max_val;
+      }
+    }
+  }
 }
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/iGELU.c b/TargetLibraries/PULPOpen/src/iGELU.c
deleted file mode 100644
index f6be595b42..0000000000
--- a/TargetLibraries/PULPOpen/src/iGELU.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* =====================================================================
- * Title: iGELU.c
- * Description:
- *
- * $Date: 13.11.2023
- *
- * ===================================================================== */
-/*
- * Copyright (C) 2020 ETH Zurich and University of Bologna.
- *
- * Author: Moritz Scherer, ETH Zurich
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "DeeployPULPMath.h"
-
-void PULPiGELU_s8_s8(int8_t *data_in, int8_t *data_out, int32_t dataSize,
-                     int8_t b, int16_t one, int32_t input_offset,
-                     int32_t output_offset, int32_t *mul, int32_t *add,
-                     int32_t *shift) {}
diff --git a/cmake/pulp/toolchain_llvm.cmake b/cmake/pulp/toolchain_llvm.cmake
index cabfe6915d..76109b8941 100644
--- a/cmake/pulp/toolchain_llvm.cmake
+++ b/cmake/pulp/toolchain_llvm.cmake
@@ -19,6 +19,7 @@ set(CMAKE_EXECUTABLE_SUFFIX ".elf")
 add_compile_options(
   -target riscv32-unknown-elf
   -march=${ISA}
+  -mabi=ilp32f
   -ffunction-sections
   -fdata-sections
   -fomit-frame-pointer
@@ -27,7 +28,7 @@ add_compile_options(
   -DNUM_CORES=${NUM_CORES}
   -MMD
   -MP
-  --sysroot=${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv/rv32imc
+  --sysroot=${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv/rv32imf
   -fno-builtin-memcpy
   -fno-builtin-memset
)
@@ -38,8 +39,9 @@ add_link_options(
  -MP
  -nostartfiles
  -march=${ISA}
-  -L${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv/rv32imc/lib
-  -L${TOOLCHAIN_INSTALL_DIR}/lib/clang/15.0.0/lib/baremetal/rv32imc/
+  -mabi=ilp32f
+  -L${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv/rv32imf/lib
+  -L${TOOLCHAIN_INSTALL_DIR}/lib/clang/15.0.0/lib/baremetal/rv32imf/
  -z norelro
  -fno-builtin-memcpy
  -fno-builtin-memset
@@ -50,4 +52,4 @@ link_libraries(
)

 add_compile_definitions(__LINK_LD)
-add_compile_definitions(__TOOLCHAIN_LLVM__)
+add_compile_definitions(__TOOLCHAIN_LLVM__)
\ No newline at end of file
diff --git a/toolchain/meson-build-script-rv32imf.txt b/toolchain/meson-build-script-rv32imf.txt
new file mode 100644
index 0000000000..2db512dadf
--- /dev/null
+++ b/toolchain/meson-build-script-rv32imf.txt
@@ -0,0 +1,19 @@
+[binaries]
+c = ['clang', '-target', 'riscv32-unknown-elf', '-march=rv32imf', '-nostdlib']
+ar = 'llvm-ar'
+strip = 'llvm-strip'
+
+[host_machine]
+system = 'none'
+cpu_family = 'riscv32'
+cpu = 'riscv32'
+endian = 'little'
+
+[properties]
+c_args = ['-Werror=double-promotion', '-Wno-unsupported-floating-point-opt', '-fshort-enums', '-mno-relax']
+c_link_args = ['-Wl,-z,noexecstack']
+skip_sanity_check = true
+default_flash_addr = '0x00000000'
+default_flash_size = '0x00400000'
+default_ram_addr = '0x20000000'
+default_ram_size = '0x00200000'
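
Reviewer note: a minimal sketch of how the new RedMule FP32 matmul driver is meant to be invoked from cluster code. It is illustrative only: the wrapper name redmule_matmul_example and the buffer shapes are assumptions, and DeeployPULPMath.h is assumed to declare MatMul_fp32_fp32_fp32_Redmule. The single-issuing-core pattern and the barrier mirror Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule above; all operands must sit in cluster L1, since the accelerator is handed raw pointers.

#include "DeeployPULPMath.h" // assumed to declare MatMul_fp32_fp32_fp32_Redmule
#include "pmsis.h"

#define M 16
#define N 32
#define O 16

// RedMule reads X/W and writes Z directly, so all operands live in L1 TCDM.
static PI_L1 float32_t srcA[M * N]; // X: left operand
static PI_L1 float32_t srcB[N * O]; // W: right operand
static PI_L1 float32_t dstY[M * O]; // Y/Z: accumulator and output

void redmule_matmul_example(void) {
  // A single core programs the memory-mapped accelerator; the other
  // cluster cores just synchronize, as in the RedMule conv kernel above.
  if (pi_core_id() == 0) {
    MatMul_fp32_fp32_fp32_Redmule(srcA, srcB, dstY, M, N, O);
  }
  pi_cl_team_barrier();
}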