diff --git a/CHANGELOG.md b/CHANGELOG.md
index 49f0972a59..877bb91915 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -16,6 +16,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Deeploy-GAP9 Platform [#143](https://github.com/pulp-platform/Deeploy/pull/143)
 - Update CLI interface Across Project, Fix Tutorial, and Remove Legacy Test [#157](https://github.com/pulp-platform/Deeploy/pull/157)
 - Fix for python error when using python 3.12.11 [#189]( https://github.com/pulp-platform/Deeploy/pull/189)
+- Add Generic target support for the operators needed in MAGIA [#193](https://github.com/pulp-platform/Deeploy/pull/193)
 
 ### Added
 - Add many missing docstrings
@@ -26,6 +27,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid
 - Add integer MaxPool1D for Generic platform and RQSConv1D support for PULPOpen, with corresponding kernel tests.
 - Added GAP9 Platform Support: Deployer, Bindings, Templates, Tiler, DMA (L3Dma/MchanDma), target library, CI workflows
 - Per-layer microbenchmarking on PULPOpen via `--profileMicrobenchmark`: new `PULPMicrobenchmark` code-transformation pass + `perf_utils.h` helpers report cycles, instructions, stalls and cache misses per layer in `RunNetwork`
+- Add Generic target support for the following operators: [Ceil](https://onnx.ai/onnx/operators/onnx__Ceil.html), [Floor](https://onnx.ai/onnx/operators/onnx__Floor.html), [Clip](https://onnx.ai/onnx/operators/onnx__Clip.html), [Sub](https://onnx.ai/onnx/operators/onnx__Sub.html), [Exp](https://onnx.ai/onnx/operators/onnx__Exp.html), [Sigmoid](https://onnx.ai/onnx/operators/onnx__Sigmoid.html), [Swish](https://onnx.ai/onnx/operators/onnx__Swish.html), [HardSigmoid](https://onnx.ai/onnx/operators/onnx__HardSigmoid.html), [HardSwish](https://onnx.ai/onnx/operators/onnx__HardSwish.html), [InstanceNormalization](https://onnx.ai/onnx/operators/onnx__InstanceNormalization.html), [GroupNormalization](https://onnx.ai/onnx/operators/onnx__GroupNormalization.html), [AveragePool](https://onnx.ai/onnx/operators/onnx__AveragePool.html), [GlobalAveragePool](https://onnx.ai/onnx/operators/onnx__GlobalAveragePool.html), [GlobalMaxPool](https://onnx.ai/onnx/operators/onnx__GlobalMaxPool.html).
 
 ### Changed
 - Use by default `devel` container for GAP9 CI
diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py
index 308b179aef..21cf01e52a 100644
--- a/Deeploy/Targets/Generic/Bindings.py
+++ b/Deeploy/Targets/Generic/Bindings.py
@@ -13,13 +13,16 @@
 from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
 from Deeploy.Targets.Generic.Templates import AddTemplate, BatchNormalizationTemplate, ConcatTemplate, ConvTemplate, \
     ConvTransposeTemplate, DebugPrintTemplate, DequantTemplate, DummyTemplate, DWConvTemplate, FloatAddTemplate, \
-    FloatConvTemplate, FloatDivTemplate, FloatDWConvTemplate, FloatGELUTemplate, FloatGemmTemplate, \
-    FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, \
-    FloatPowTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, FloatSqrtTemplate, \
-    GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \
-    MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \
-    RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \
-    iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate
+    FloatAveragePoolTemplate, FloatCeilTemplate, FloatClipTemplate, FloatConvTemplate, FloatDivTemplate, \
+    FloatDWConvTemplate, FloatExpTemplate, FloatFloorTemplate, FloatGELUTemplate, FloatGemmTemplate, \
+    FloatGlobalAveragePoolTemplate, FloatGlobalMaxPoolTemplate, FloatGroupNormTemplate, FloatHardSigmoidTemplate, \
+    FloatHardSwishTemplate, FloatInstanceNormTemplate, FloatLayernormTemplate, FloatMatMulTemplate, \
+    FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, FloatPowTemplate, FloatReduceMeanTemplate, \
+    FloatReluTemplate, FloatSigmoidTemplate, FloatSoftmaxTemplate, FloatSqrtTemplate, FloatSubTemplate, \
+    FloatSwishTemplate, GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, \
+    MatMulTemplate, MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \
+    RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, SubTemplate, \
+    TransposeTemplate, iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate
 from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \
     DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \
     LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \
@@ -54,6 +57,17 @@
                 FloatAddTemplate.referenceTemplate, BasicTransformer)
 ]
 
+# Sub reuses AddChecker: the type-checking rules for Sub are exactly the same as for Add
+BasicSubBindings = [
+    NodeBinding(AddChecker([PointerClass(type1), PointerClass(type2)], [PointerClass(int32_t)]),
+                SubTemplate.referenceTemplate, BasicTransformer)
+    for type1 in IntegerDataTypes
+    for type2 in IntegerDataTypes
+] + [
+    NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatSubTemplate.referenceTemplate, BasicTransformer)
+]
+
 BasicConv1DBindings = [
     NodeBinding(ConvChecker(
         [PointerClass(type), PointerClass(type), PointerClass(type)], [PointerClass(type)]),
@@ -327,3 +341,82 @@
         ConvTransposeTemplate.referenceTemplate,
         BasicTransformer) for type in FloatDataTypes
 ]
+
+BasicCeilBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatCeilTemplate.referenceTemplate,
+                BasicTransformer),
+]
+
+BasicFloorBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatFloorTemplate.referenceTemplate, BasicTransformer),
+]
+
+BasicClipBindings = [
+    NodeBinding(
+        DummyChecker(
+            [PointerClass(float32_t), PointerClass(float32_t),
+             PointerClass(float32_t)], [PointerClass(float32_t)]), FloatClipTemplate.referenceTemplate,
+        BasicTransformer),
+]
+
+BasicExpBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatExpTemplate.referenceTemplate,
+                BasicTransformer),
+]
+
+BasicSigmoidBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatSigmoidTemplate.referenceTemplate, BasicTransformer),
+]
+
+BasicSwishBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatSwishTemplate.referenceTemplate, BasicTransformer),
+]
+
+BasicHardSigmoidBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatHardSigmoidTemplate.referenceTemplate, BasicTransformer),
+]
+
+BasicHardSwishBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatHardSwishTemplate.referenceTemplate, BasicTransformer),
+]
+
+BasicInstanceNormBindings = [
+    NodeBinding(
+        DummyChecker(
+            [PointerClass(float32_t), PointerClass(float32_t),
+             PointerClass(float32_t)], [PointerClass(float32_t)]), FloatInstanceNormTemplate.referenceTemplate,
+        BasicTransformer),
+]
+
+BasicGroupNormBindings = [
+    NodeBinding(
+        DummyChecker(
+            [PointerClass(float32_t), PointerClass(float32_t),
+             PointerClass(float32_t)], [PointerClass(float32_t)]), FloatGroupNormTemplate.referenceTemplate,
+        BasicTransformer),
+]
+
+BasicAveragePool1DBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatAveragePoolTemplate.referenceTemplate1d, BasicTransformer)
+]
+
+BasicAveragePool2DBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatAveragePoolTemplate.referenceTemplate2d, BasicTransformer)
+]
+
+BasicGlobalAveragePoolBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatGlobalAveragePoolTemplate.referenceTemplate, BasicTransformer)
+]
+
+BasicGlobalMaxPoolBindings = [
+    NodeBinding(DummyChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatGlobalMaxPoolTemplate.referenceTemplate, BasicTransformer)
+]
diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py
index cc733937cc..605b8cf782 100644
--- a/Deeploy/Targets/Generic/Layers.py
+++ b/Deeploy/Targets/Generic/Layers.py
@@ -188,6 +188,9 @@ def computeOps(self):
         return self.mapper.parser.operatorRepresentation['size']
 
 
+SubLayer = AddLayer
+
+
 class MatMulLayer(ONNXLayer):
 
     def __init__(self, maps: List[NodeMapper]):
@@ -709,3 +712,99 @@ def computeOps(self):
             numPx = opRep['dim_im_out_x']
 
         return numPx * opsPerPx
+
+
+class CeilLayer(ONNXLayer):
+
+    def computeOps(self):
+        return self.mapper.parser.operatorRepresentation['size']
+
+
+class FloorLayer(ONNXLayer):
+
+    def computeOps(self):
+        return self.mapper.parser.operatorRepresentation['size']
+
+
+class ClipLayer(ONNXLayer):
+
+    def computeOps(self):
+        return self.mapper.parser.operatorRepresentation['size'] * 2  # compare vs min and max
+
+
+class ExpLayer(ONNXLayer):
+
+    def computeOps(self):
+        return self.mapper.parser.operatorRepresentation['size']
+
+
+class SigmoidLayer(ONNXLayer):
+
+    def computeOps(self):
+        # sigmoid(x) = 1 / (1 + exp(-x)): neg, exp, add, div
+        return self.mapper.parser.operatorRepresentation['size'] * 4
+
+
+class SwishLayer(ONNXLayer):
+
+    def computeOps(self):
+        # x * sigmoid(x): 4 ops for sigmoid + 1 mul
+        return self.mapper.parser.operatorRepresentation['size'] * 5
+
+
+class HardSigmoidLayer(ONNXLayer):
+
+    def computeOps(self):
+        # max(0, min(1, alpha*x + beta)): mul, add, clip(min), clip(max)
+        return self.mapper.parser.operatorRepresentation['size'] * 4
+
+
+class HardSwishLayer(ONNXLayer):
+
+    def computeOps(self):
+        # x * HardSigmoid(x): 4 ops for hard sigmoid + 1 mul
+        return self.mapper.parser.operatorRepresentation['size'] * 5
+
+
+class InstanceNormLayer(ONNXLayer):
+
+    def computeOps(self):
+        # per element: mean-sum(1) + variance(sub+sq+add=3) + normalize(sub+div=2) + affine(mul+add=2) = 8
+        # per (batch, channel): mean(div=1) + variance(sqrt+div=2) = 3
+        opRep = self.mapper.parser.operatorRepresentation
+        B, C, S = int(opRep['batch_size']), int(opRep['num_channels']), int(opRep['spatial'])
+        return B * C * (S * 8 + 3)
+
+
+class GroupNormLayer(ONNXLayer):
+
+    def computeOps(self):
+        # same structure as InstanceNorm: 8 ops/element + 3 ops per (batch, channel)
+        opRep = self.mapper.parser.operatorRepresentation
+        B, C, S = int(opRep['batch_size']), int(opRep['num_channels']), int(opRep['spatial'])
+        return B * C * (S * 8 + 3)
+
+
+class AveragePoolLayer(ONNXLayer):
+
+    def computeOps(self):
+        opRep = self.mapper.parser.operatorRepresentation
+        kernel_elements = int(np.prod(opRep['kernel_shape']))
+        # (kernel_elements - 1) additions + 1 division per output element
+        return opRep['data_out_size'] * kernel_elements
+
+
+class GlobalAveragePoolLayer(ONNXLayer):
+
+    def computeOps(self):
+        opRep = self.mapper.parser.operatorRepresentation
+        # (spatial_size - 1) additions + 1 division per output channel
+        return int(opRep['batch_size'] * opRep['num_channels'] * opRep['spatial_size'])
+
+
+class GlobalMaxPoolLayer(ONNXLayer):
+
+    def computeOps(self):
+        opRep = self.mapper.parser.operatorRepresentation
+        # (spatial_size - 1) comparisons per output channel
+        return int(opRep['batch_size'] * opRep['num_channels'] * (opRep['spatial_size'] - 1))
diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py
index ad787d9e4b..c750402198 100644
--- a/Deeploy/Targets/Generic/Parsers.py
+++ b/Deeploy/Targets/Generic/Parsers.py
@@ -11,6 +11,23 @@
 from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeParser, VariableBuffer
 
 
+class UnaryElementWiseParser(NodeParser):
+    """Common parser for operators with exactly one input and one output tensor."""
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return (len(node.inputs), len(node.outputs)) == (1, 1)
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+        in_buf, out_buf = (ctxt.lookup(tensor.name) for tensor in (node.inputs[0], node.outputs[0]))
+        self.operatorRepresentation['data_in'] = in_buf.name
+        self.operatorRepresentation['data_out'] = out_buf.name
+        self.operatorRepresentation['size'] = int(np.prod(in_buf.shape))
+        return ctxt, True
+
+
 class ConcatParser(NodeParser):
 
     def __init__(self):
@@ -492,6 +509,9 @@ def parseNodeCtxt(self,
         return ctxt, True
 
 
+SubParser = AddParser
+
+
 class ReduceParser(NodeParser):
 
     def __init__(self):
@@ -1092,29 +1112,10 @@ def parseNodeCtxt(self,
         return ctxt, True
 
 
-class ReluParser(NodeParser):
-
-    def __init__(self):
-        super().__init__()
-
-    def parseNode(self, node: gs.Node) -> (bool):
-
-        ret = all([len(node.inputs) == 1, len(node.outputs) == 1])
-
-        return ret
-
-    def parseNodeCtxt(self,
-                      ctxt: NetworkContext,
-                      node: gs.Node,
-                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
-
-        data_in = ctxt.lookup(node.inputs[0].name)
-        data_out = ctxt.lookup(node.outputs[0].name)
-        self.operatorRepresentation['data_in'] = data_in.name
-        self.operatorRepresentation['data_out'] = data_out.name
-        self.operatorRepresentation['size'] = np.prod(data_in.shape)
+class ReluParser(UnaryElementWiseParser):
 
-        return ctxt, True
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'Relu'
 
 
 class ReshapeParser(NodeParser):
@@ -2865,13 +2866,185 @@ def parseNodeCtxt(self,
         return ctxt, False
 
 
-class SqrtParser(NodeParser):
+class SqrtParser(UnaryElementWiseParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'Sqrt'
+
+
+class CeilParser(UnaryElementWiseParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'Ceil'
+
+
+class FloorParser(UnaryElementWiseParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'Floor'
+
+
+class ClipParser(UnaryElementWiseParser):
+    """Parser for Clip nodes whose optional min/max bounds are graph constants."""
+
+    def parseNode(self, node: gs.Node) -> bool:
+        # Clip allows 1–3 inputs (optional min/max constants), so we can't use super()
+        return node.op == 'Clip' and len(node.outputs) == 1 and 1 <= len(node.inputs) <= 3
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        ctxt, ok = super().parseNodeCtxt(ctxt, node, channels_first)
+        if not ok:
+            return ctxt, False
+
+        # min_val and max_val only handled as constants
+        # Defaults: full float32 range
+        self.operatorRepresentation['min_val'] = -np.finfo(np.float32).max
+        self.operatorRepresentation['max_val'] = np.finfo(np.float32).max
+
+        for key, idx in (('min_val', 1), ('max_val', 2)):
+            if idx >= len(node.inputs) or node.inputs[idx].name == '':
+                continue  # bound omitted: keep the default
+            if not isinstance(node.inputs[idx], gs.Constant):
+                # A runtime bound cannot be baked in; reject rather than silently skip clipping
+                return ctxt, False
+            self.operatorRepresentation[key] = float(node.inputs[idx].values.item())
+
+        return ctxt, True
+
+
+class ExpParser(UnaryElementWiseParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'Exp'
+
+
+class SigmoidParser(UnaryElementWiseParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'Sigmoid'
+
+
+class SwishParser(UnaryElementWiseParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        if not (super().parseNode(node) and node.op == 'Swish'):
+            return False
+        self.operatorRepresentation['alpha'] = node.attrs.get('alpha', 1.0)
+        return True
+
+
+class HardSigmoidParser(UnaryElementWiseParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        if not (super().parseNode(node) and node.op == 'HardSigmoid'):
+            return False
+        self.operatorRepresentation['alpha'] = node.attrs.get('alpha', 0.2)
+        self.operatorRepresentation['beta'] = node.attrs.get('beta', 0.5)
+        return True
 
-    def __init__(self):
-        super().__init__()
+
+class HardSwishParser(UnaryElementWiseParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'HardSwish'
+
+
+class NormalizationParser(NodeParser):
+    """Shared parser for normalization nodes with (data, scale, bias) inputs and one output."""
+
+    def parseNode(self, node: gs.Node) -> bool:
+        # Exactly three inputs (data, scale, bias) and a single output are required.
+        if len(node.inputs) != 3 or len(node.outputs) != 1:
+            return False
+
+        self.operatorRepresentation['epsilon'] = node.attrs.get('epsilon', 1e-5)
+
+        return True
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+        # Record the buffer names and the batch / channel / flattened-spatial extents of the input.
+        data_in = ctxt.lookup(node.inputs[0].name)
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['scale'] = ctxt.lookup(node.inputs[1].name).name
+        self.operatorRepresentation['bias'] = ctxt.lookup(node.inputs[2].name).name
+        self.operatorRepresentation['data_out'] = ctxt.lookup(node.outputs[0].name).name
+        self.operatorRepresentation['batch_size'] = data_in.shape[0]
+        self.operatorRepresentation['num_channels'] = data_in.shape[1]
+        # Store a plain int: np.prod yields a NumPy scalar, and float 1.0 when shape[2:] is empty.
+        self.operatorRepresentation['spatial'] = int(np.prod(data_in.shape[2:]))
+        return ctxt, True
+
+
+class InstanceNormParser(NormalizationParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'InstanceNormalization'
+
+
+class GroupNormParser(NormalizationParser):
+
+    # TODO: attribute stash_type not handled
+    def parseNode(self, node: gs.Node) -> bool:
+        if not all([
+                super().parseNode(node),
+                node.op == 'GroupNormalization',
+                'num_groups' in node.attrs,
+        ]):
+            return False
+        self.operatorRepresentation['num_groups'] = node.attrs['num_groups']
+        return True
+
+
+class AveragePoolParser(NodeParser):
 
     def parseNode(self, node: gs.Node) -> bool:
-        return node.op == 'Sqrt' and len(node.inputs) == 1 and len(node.outputs) == 1
+
+        if not all([
+                node.op == 'AveragePool',
+                len(node.inputs) == 1,
+                len(node.outputs) == 1,
+                'kernel_shape' in node.attrs,
+        ]):
+            return False
+
+        kernel_shape = node.attrs['kernel_shape']
+        spatial_ndim = len(kernel_shape)
+
+        auto_pad = node.attrs.get('auto_pad', 'NOTSET')
+        ceil_mode = node.attrs.get('ceil_mode', 0)
+        count_include_pad = node.attrs.get('count_include_pad', 0)
+        dilations = node.attrs.get('dilations', (1,) * spatial_ndim)
+        strides = node.attrs.get('strides', (1,) * spatial_ndim)
+        pads = node.attrs.get('pads', (0,) * (2 * spatial_ndim))
+
+        if not all([
+                auto_pad == 'NOTSET',  # TODO: implement other values
+                ceil_mode == 0,  # TODO: implement other values
+                count_include_pad == 0,  # TODO: implement other values
+                all([d == 1 for d in dilations]),  # TODO: implement other values
+                len(dilations) == spatial_ndim,
+                len(strides) == spatial_ndim,
+                len(pads) == 2 * spatial_ndim,
+                all([s > 0 for s in strides]),
+        ]):
+            return False
+
+        self.operatorRepresentation['kernel_shape'] = kernel_shape
+        self.operatorRepresentation['auto_pad'] = auto_pad
+        self.operatorRepresentation['ceil_mode'] = ceil_mode
+        self.operatorRepresentation['count_include_pad'] = count_include_pad
+        self.operatorRepresentation['dilations'] = dilations
+        self.operatorRepresentation['strides'] = strides
+        self.operatorRepresentation['pads'] = pads
+
+        return True
 
     def parseNodeCtxt(self,
                       ctxt: NetworkContext,
@@ -2880,9 +3053,68 @@ def parseNodeCtxt(self,
 
         data_in = ctxt.lookup(node.inputs[0].name)
         data_out = ctxt.lookup(node.outputs[0].name)
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['data_out'] = data_out.name
+
+        self.operatorRepresentation['batch_size'] = data_in.shape[0]
+        self.operatorRepresentation['num_channels'] = data_in.shape[1]
+        self.operatorRepresentation['data_out_size'] = int(np.prod(data_out.shape))
+
+        spatial_shape = data_in.shape[2:]
+        if len(self.operatorRepresentation['kernel_shape']) != len(spatial_shape):
+            return ctxt, False
+
+        if len(spatial_shape) == 1:
+            self.operatorRepresentation['length'] = spatial_shape[0]
+        elif len(spatial_shape) == 2:
+            self.operatorRepresentation['height'] = spatial_shape[0]
+            self.operatorRepresentation['width'] = spatial_shape[1]
+        else:
+            return ctxt, False
+
+        return ctxt, True
+
+
+class AveragePool1DParser(AveragePoolParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and len(node.attrs['kernel_shape']) == 1
+
+
+class AveragePool2DParser(AveragePoolParser):
 
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and len(node.attrs['kernel_shape']) == 2
+
+
+class GlobalPoolParser(NodeParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return len(node.inputs) == 1 and len(node.outputs) == 1
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+        # Record the buffer names plus the batch, channel and flattened spatial extents of the input.
+        data_in = ctxt.lookup(node.inputs[0].name)
+        data_out = ctxt.lookup(node.outputs[0].name)
         self.operatorRepresentation['data_in'] = data_in.name
         self.operatorRepresentation['data_out'] = data_out.name
-        self.operatorRepresentation['size'] = int(np.prod(data_in.shape))
+        self.operatorRepresentation['batch_size'] = data_in.shape[0]
+        self.operatorRepresentation['num_channels'] = data_in.shape[1]
+        self.operatorRepresentation['spatial_size'] = int(np.prod(data_in.shape[2:]))  # int even if no spatial dims
 
         return ctxt, True
+
+
+class GlobalAveragePoolParser(GlobalPoolParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'GlobalAveragePool'
+
+
+class GlobalMaxPoolParser(GlobalPoolParser):
+
+    def parseNode(self, node: gs.Node) -> bool:
+        return super().parseNode(node) and node.op == 'GlobalMaxPool'
diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py
index e05e897270..2aa1ef1c38 100644
--- a/Deeploy/Targets/Generic/Platform.py
+++ b/Deeploy/Targets/Generic/Platform.py
@@ -6,33 +6,40 @@
     RemoveEmptyConvBiasPass, RemoveOnlySingletonReduceMeanPass
 from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \
     StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
-from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicBatchNormBindings, BasicConcatBindings, \
-    BasicConv1DBindings, BasicConv2DBindings, BasicConvTransposeBindings, BasicDebugPrintBindings, \
-    BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, BasicGatherBindings, \
-    BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \
-    BasicLayerNormBindings, BasicMatMulBindings, BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, \
-    BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, \
-    BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, \
-    BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicTransposeBindings, \
-    DummyBinding
-from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \
-    ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, ITAMaxLayer, \
-    LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \
-    ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \
-    SoftmaxLayer, SqrtLayer, TransposeLayer
-from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \
-    DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \
-    GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \
-    IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \
-    Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \
-    RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \
-    TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser
+from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicAveragePool1DBindings, BasicAveragePool2DBindings, \
+    BasicBatchNormBindings, BasicCeilBindings, BasicClipBindings, BasicConcatBindings, BasicConv1DBindings, \
+    BasicConv2DBindings, BasicConvTransposeBindings, BasicDebugPrintBindings, BasicDequantBindings, BasicDivBindings, \
+    BasicDWConv1DBinding, BasicDWConv2DBindings, BasicExpBindings, BasicFloorBindings, BasicGatherBindings, \
+    BasicGELUBindings, BasicGEMMBindings, BasicGlobalAveragePoolBindings, BasicGlobalMaxPoolBindings, \
+    BasicGroupNormBindings, BasicHardSigmoidBindings, BasicHardSwishBindings, BasicInstanceNormBindings, \
+    BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, BasicLayerNormBindings, BasicMatMulBindings, \
+    BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, BasicPad1DBindings, BasicPad2DBindings, \
+    BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, BasicReduceSumBindings, BasicReluBinding, \
+    BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, BasicRQSGELUBinding, BasicSigmoidBindings, \
+    BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicSubBindings, BasicSwishBindings, \
+    BasicTransposeBindings, DummyBinding
+from Deeploy.Targets.Generic.Layers import AddLayer, AveragePoolLayer, BatchNormalizationLayer, CeilLayer, ClipLayer, \
+    ConcatLayer, ConvLayer, ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, ExpLayer, FloorLayer, \
+    GatherLayer, GELULayer, GEMMLayer, GlobalAveragePoolLayer, GlobalMaxPoolLayer, GroupNormLayer, HardSigmoidLayer, \
+    HardSwishLayer, InstanceNormLayer, ITAMaxLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, \
+    PowLayer, QuantLayer, ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, \
+    RQIntegerDivLayer, RQSiGELULayer, SigmoidLayer, SliceLayer, SoftmaxLayer, SqrtLayer, SubLayer, SwishLayer, TransposeLayer
+from Deeploy.Targets.Generic.Parsers import AddParser, AveragePool1DParser, AveragePool2DParser, BatchNormParser, \
+    CeilParser, ClipParser, ConcatParser, ConvTranspose1DParser, DebugParser, DequantParser, DivParser, DummyParser, \
+    ExpParser, FlattenParser, FloorParser, GatherParser, GELUParser, GenericConv1DParser, GenericConv2DParser, \
+    GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, GlobalAveragePoolParser, \
+    GlobalMaxPoolParser, GroupNormParser, HardSigmoidParser, HardSwishParser, InstanceNormParser, IntegerDivParser, \
+    ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, Pad1DParser, \
+    Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, \
+    ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SigmoidParser, SliceParser, SoftmaxParser, SqrtParser, \
+    SubParser, SwishParser, TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser
 from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \
     ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \
     iGELURequantMergePass
 
 AddMapper = NodeMapper(AddParser(), BasicAddBindings)
+SubMapper = NodeMapper(SubParser(), BasicSubBindings)
 Conv1DMapper = NodeMapper(GenericConv1DParser(), BasicConv1DBindings)
 Conv2DMapper = NodeMapper(GenericConv2DParser(), BasicConv2DBindings)
 ConcatMapper = NodeMapper(ConcatParser(), BasicConcatBindings)
@@ -73,6 +80,20 @@
 BatchNormalizationMapper = NodeMapper(BatchNormParser(), BasicBatchNormBindings)
 ConvTransposeMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTransposeBindings)
 SliceMapper = NodeMapper(SliceParser(), BasicSliceBindings)
+CeilMapper = NodeMapper(CeilParser(), BasicCeilBindings)
+FloorMapper = NodeMapper(FloorParser(), BasicFloorBindings)
+ClipMapper = NodeMapper(ClipParser(), BasicClipBindings)
+ExpMapper = NodeMapper(ExpParser(), BasicExpBindings)
+SigmoidMapper = NodeMapper(SigmoidParser(), BasicSigmoidBindings)
+SwishMapper = NodeMapper(SwishParser(), BasicSwishBindings)
+HardSigmoidMapper = NodeMapper(HardSigmoidParser(), BasicHardSigmoidBindings)
+HardSwishMapper = NodeMapper(HardSwishParser(), BasicHardSwishBindings)
+InstanceNormMapper = NodeMapper(InstanceNormParser(), BasicInstanceNormBindings)
+GroupNormMapper = NodeMapper(GroupNormParser(), BasicGroupNormBindings)
+AveragePool1DMapper = NodeMapper(AveragePool1DParser(), BasicAveragePool1DBindings)
+AveragePool2DMapper = NodeMapper(AveragePool2DParser(), BasicAveragePool2DBindings)
+GlobalAveragePoolMapper = NodeMapper(GlobalAveragePoolParser(), BasicGlobalAveragePoolBindings)
+GlobalMaxPoolMapper = NodeMapper(GlobalMaxPoolParser(), BasicGlobalMaxPoolBindings)
 
 # Dummy nodes are intended for development purposes only!
 # They should always generate compiler errors to not accidentally end up in production code
@@ -80,6 +101,7 @@
 
 GenericMapping = {
     'Add': AddLayer([AddMapper]),
+    'Sub': SubLayer([SubMapper]),
     'Conv': ConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]),
     'Concat': ConcatLayer([ConcatMapper]),
     'DebugPrint': DebugPrintLayer([DebugMapper]),
@@ -118,7 +140,20 @@
     'Quant': QuantLayer([QuantMapper]),
     'Dequant': DequantLayer([DequantMapper]),
     'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]),
-    'ConvTranspose': ConvTransposeLayer([ConvTransposeMapper])
+    'ConvTranspose': ConvTransposeLayer([ConvTransposeMapper]),
+    'Ceil': CeilLayer([CeilMapper]),
+    'Floor': FloorLayer([FloorMapper]),
+    'Clip': ClipLayer([ClipMapper]),
+    'Exp': ExpLayer([ExpMapper]),
+    'Sigmoid': SigmoidLayer([SigmoidMapper]),
+    'Swish': SwishLayer([SwishMapper]),
+    'HardSigmoid': SigmoidLayer([HardSigmoidMapper]),
+    'HardSwish': SwishLayer([HardSwishMapper]),
+    'InstanceNormalization': InstanceNormLayer([InstanceNormMapper]),
+    'GroupNormalization': GroupNormLayer([GroupNormMapper]),
+    'AveragePool': AveragePoolLayer([AveragePool1DMapper, AveragePool2DMapper]),
+    'GlobalAveragePool': GlobalAveragePoolLayer([GlobalAveragePoolMapper]),
+    'GlobalMaxPool': GlobalMaxPoolLayer([GlobalMaxPoolMapper]),
     # # For example, you can use the DummpyMapper, in case you want to test
     # # deployment or optimizations with GlobalAveragePool nodes but did not yet
     # # implement the corresponding kernel
diff --git a/Deeploy/Targets/Generic/Templates/FloatAveragePoolTemplate.py b/Deeploy/Targets/Generic/Templates/FloatAveragePoolTemplate.py
new file mode 100644
index 0000000000..36519dacc2
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatAveragePoolTemplate.py
@@ -0,0 +1,31 @@
+# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _AveragePoolTemplate(NodeTemplate):
+    """Float AveragePool call template; ``alignToContext`` supplies the ``type_width`` key the text uses."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Store the ``typeWidth`` of the looked-up ``data_in`` referenced type; the returned list is empty."""
+        inputType = ctxt.lookup(operatorRepresentation['data_in'])._type
+        operatorRepresentation['type_width'] = inputType.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate1d = _AveragePoolTemplate("""
+// Average Pool 1D (Name: ${nodeName}, Op: ${nodeOp})
+AveragePool1d_fp${type_width}_fp${type_width}(
+    ${data_in}, ${data_out}, ${batch_size}, ${num_channels}, ${length}, ${kernel_shape[0]},
+    ${strides[0]}, ${pads[0]}, ${pads[1]});
+""")
+
+referenceTemplate2d = _AveragePoolTemplate("""
+// Average Pool 2D (Name: ${nodeName}, Op: ${nodeOp})
+AveragePool2d_fp${type_width}_fp${type_width}(
+    ${data_in}, ${data_out}, ${batch_size}, ${num_channels}, ${height}, ${width},
+    ${kernel_shape[0]}, ${kernel_shape[1]}, ${strides[0]}, ${strides[1]},
+    ${pads[0]}, ${pads[1]}, ${pads[2]}, ${pads[3]});
+""")
diff --git a/Deeploy/Targets/Generic/Templates/FloatCeilTemplate.py b/Deeploy/Targets/Generic/Templates/FloatCeilTemplate.py
new file mode 100644
index 0000000000..198bb3d9c8
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatCeilTemplate.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _CeilTemplate(NodeTemplate):
+    """Float Ceil call template; ``alignToContext`` supplies the ``size`` and ``type_width`` keys it uses."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Set ``size`` to the product of the ``data_in`` shape and ``type_width`` to its referenced typeWidth."""
+        inputBuffer = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['size'] = int(np.prod(inputBuffer.shape))
+        operatorRepresentation['type_width'] = inputBuffer._type.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _CeilTemplate("""
+// Ceil (Name: ${nodeName}, Op: ${nodeOp})
+Ceil_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${size});
+""")
diff --git a/Deeploy/Targets/Generic/Templates/FloatClipTemplate.py b/Deeploy/Targets/Generic/Templates/FloatClipTemplate.py
new file mode 100644
index 0000000000..c61b421755
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatClipTemplate.py
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _ClipTemplate(NodeTemplate):
+    """Float Clip call template; sets ``size``/``type_width`` here, ``min_val``/``max_val`` come from elsewhere."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        inputBuffer = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['size'] = int(np.prod(inputBuffer.shape))  # product of all input dimensions
+        operatorRepresentation['type_width'] = inputBuffer._type.referencedType.typeWidth  # used in kernel name
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _ClipTemplate("""
+// Clip (Name: ${nodeName}, Op: ${nodeOp})
+Clip_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${min_val}, ${max_val}, ${size});
+""")
diff --git a/Deeploy/Targets/Generic/Templates/FloatExpTemplate.py b/Deeploy/Targets/Generic/Templates/FloatExpTemplate.py
new file mode 100644
index 0000000000..734d7e0fea
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatExpTemplate.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _ExpTemplate(NodeTemplate):
+    """Float Exp call template; ``alignToContext`` supplies the ``size`` and ``type_width`` keys it uses."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Set ``size`` to the product of the ``data_in`` shape and ``type_width`` to its referenced typeWidth."""
+        inputBuffer = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['size'] = int(np.prod(inputBuffer.shape))
+        operatorRepresentation['type_width'] = inputBuffer._type.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _ExpTemplate("""
+// Exp (Name: ${nodeName}, Op: ${nodeOp})
+Exp_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${size});
+""")
diff --git a/Deeploy/Targets/Generic/Templates/FloatFloorTemplate.py b/Deeploy/Targets/Generic/Templates/FloatFloorTemplate.py
new file mode 100644
index 0000000000..2d9768c1f4
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatFloorTemplate.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _FloorTemplate(NodeTemplate):
+    """Float Floor call template; ``alignToContext`` supplies the ``size`` and ``type_width`` keys it uses."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Set ``size`` to the product of the ``data_in`` shape and ``type_width`` to its referenced typeWidth."""
+        inputBuffer = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['size'] = int(np.prod(inputBuffer.shape))
+        operatorRepresentation['type_width'] = inputBuffer._type.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _FloorTemplate("""
+// Floor (Name: ${nodeName}, Op: ${nodeOp})
+Floor_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${size});
+""")
diff --git a/Deeploy/Targets/Generic/Templates/FloatGlobalAveragePoolTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGlobalAveragePoolTemplate.py
new file mode 100644
index 0000000000..519fd8e82b
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatGlobalAveragePoolTemplate.py
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _GlobalAveragePoolTemplate(NodeTemplate):
+    """Float GlobalAveragePool call template; the call passes one collapsed ``spatial_size``, not per-axis sizes."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Store the ``typeWidth`` of the looked-up ``data_in`` referenced type; the returned list is empty."""
+        data_in = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['type_width'] = data_in._type.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _GlobalAveragePoolTemplate("""
+// Global Average Pool (Name: ${nodeName}, Op: ${nodeOp})
+GlobalAveragePool_fp${type_width}_fp${type_width}(
+    ${data_in}, ${data_out}, ${batch_size}, ${num_channels}, ${spatial_size});
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Generic/Templates/FloatGlobalMaxPoolTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGlobalMaxPoolTemplate.py
new file mode 100644
index 0000000000..c41743a898
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatGlobalMaxPoolTemplate.py
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _GlobalMaxPoolTemplate(NodeTemplate):
+    """Float GlobalMaxPool call template; the call passes one collapsed ``spatial_size``, not per-axis sizes."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Store the ``typeWidth`` of the looked-up ``data_in`` referenced type; the returned list is empty."""
+        data_in = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['type_width'] = data_in._type.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _GlobalMaxPoolTemplate("""
+// Global Max Pool (Name: ${nodeName}, Op: ${nodeOp})
+GlobalMaxPool_fp${type_width}_fp${type_width}(
+    ${data_in}, ${data_out}, ${batch_size}, ${num_channels}, ${spatial_size});
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Generic/Templates/FloatGroupNormTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGroupNormTemplate.py
new file mode 100644
index 0000000000..9c42d8011c
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatGroupNormTemplate.py
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _GroupNormTemplate(NodeTemplate):
+    """Float GroupNormalization call template; only ``type_width`` is computed here, other keys come from elsewhere."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        inputType = ctxt.lookup(operatorRepresentation['data_in'])._type  # type of the looked-up input entry
+        operatorRepresentation['type_width'] = inputType.referencedType.typeWidth  # used in kernel name
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _GroupNormTemplate("""
+// Group Normalization (Name: ${nodeName}, Op: ${nodeOp})
+GroupNormalization_fp${type_width}_fp${type_width}(
+    ${data_in}, ${data_out}, ${scale}, ${bias},
+    ${batch_size}, ${num_channels}, ${spatial}, ${num_groups}, ${epsilon});
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Generic/Templates/FloatHardSigmoidTemplate.py b/Deeploy/Targets/Generic/Templates/FloatHardSigmoidTemplate.py
new file mode 100644
index 0000000000..135f168c3f
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatHardSigmoidTemplate.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _HardSigmoidTemplate(NodeTemplate):
+    """Float HardSigmoid call template; sets ``size``/``type_width`` here, ``alpha``/``beta`` come from elsewhere."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Set ``size`` to the product of the ``data_in`` shape and ``type_width`` to its referenced typeWidth."""
+        data_in = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['size'] = int(np.prod(data_in.shape))
+        operatorRepresentation['type_width'] = data_in._type.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _HardSigmoidTemplate("""
+// HardSigmoid (Name: ${nodeName}, Op: ${nodeOp})
+HardSigmoid_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${alpha}, ${beta}, ${size});
+""")
diff --git a/Deeploy/Targets/Generic/Templates/FloatHardSwishTemplate.py b/Deeploy/Targets/Generic/Templates/FloatHardSwishTemplate.py
new file mode 100644
index 0000000000..6ff5c11c77
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatHardSwishTemplate.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _HardSwishTemplate(NodeTemplate):
+    """Float HardSwish call template; ``alignToContext`` supplies the ``size`` and ``type_width`` keys it uses."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Set ``size`` to the product of the ``data_in`` shape and ``type_width`` to its referenced typeWidth."""
+        data_in = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['size'] = int(np.prod(data_in.shape))
+        operatorRepresentation['type_width'] = data_in._type.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _HardSwishTemplate("""
+// HardSwish (Name: ${nodeName}, Op: ${nodeOp})
+HardSwish_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${size});
+""")
diff --git a/Deeploy/Targets/Generic/Templates/FloatInstanceNormTemplate.py b/Deeploy/Targets/Generic/Templates/FloatInstanceNormTemplate.py
new file mode 100644
index 0000000000..efcfce5f86
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatInstanceNormTemplate.py
@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _InstanceNormTemplate(NodeTemplate):
+    """Float InstanceNormalization call template; only ``type_width`` is computed here, other keys come from elsewhere."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        inputType = ctxt.lookup(operatorRepresentation['data_in'])._type  # type of the looked-up input entry
+        operatorRepresentation['type_width'] = inputType.referencedType.typeWidth  # used in kernel name
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _InstanceNormTemplate("""
+// Instance Normalization (Name: ${nodeName}, Op: ${nodeOp})
+InstanceNormalization_fp${type_width}_fp${type_width}(
+    ${data_in}, ${data_out}, ${scale}, ${bias}, ${batch_size}, ${num_channels}, ${spatial}, ${epsilon});
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Generic/Templates/FloatSigmoidTemplate.py b/Deeploy/Targets/Generic/Templates/FloatSigmoidTemplate.py
new file mode 100644
index 0000000000..a25bf411e5
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatSigmoidTemplate.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _SigmoidTemplate(NodeTemplate):
+    """Float Sigmoid call template; ``alignToContext`` supplies the ``size`` and ``type_width`` keys it uses."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Set ``size`` to the product of the ``data_in`` shape and ``type_width`` to its referenced typeWidth."""
+        inputBuffer = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['size'] = int(np.prod(inputBuffer.shape))
+        operatorRepresentation['type_width'] = inputBuffer._type.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _SigmoidTemplate("""
+// Sigmoid (Name: ${nodeName}, Op: ${nodeOp})
+Sigmoid_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${size});
+""")
diff --git a/Deeploy/Targets/Generic/Templates/FloatSubTemplate.py b/Deeploy/Targets/Generic/Templates/FloatSubTemplate.py
new file mode 100644
index 0000000000..fcae7e1c0d
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatSubTemplate.py
@@ -0,0 +1,14 @@
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// Sub (Name: ${nodeName}, Op: ${nodeOp})
+BEGIN_SINGLE_CORE
+    for (uint32_t i=0;i<${size};i++){
+        ${data_out}[i] = ${data_in_1}[i] - ${data_in_2}[i];
+    }
+END_SINGLE_CORE
+""")  # element-wise data_in_1 - data_in_2 over ``size`` elements, with no offset term
diff --git a/Deeploy/Targets/Generic/Templates/FloatSwishTemplate.py b/Deeploy/Targets/Generic/Templates/FloatSwishTemplate.py
new file mode 100644
index 0000000000..244e19ee0b
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatSwishTemplate.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _SwishTemplate(NodeTemplate):
+    """Float Swish call template; sets ``size``/``type_width`` here, ``alpha`` comes from elsewhere."""
+    def alignToContext(self, ctxt: NetworkContext,
+                       operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, dict, list[str]]:
+        """Set ``size`` to the product of the ``data_in`` shape and ``type_width`` to its referenced typeWidth."""
+        data_in = ctxt.lookup(operatorRepresentation['data_in'])
+        operatorRepresentation['size'] = int(np.prod(data_in.shape))
+        operatorRepresentation['type_width'] = data_in._type.referencedType.typeWidth
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _SwishTemplate("""
+// Swish (Name: ${nodeName}, Op: ${nodeOp})
+Swish_fp${type_width}_fp${type_width}(${data_in}, ${data_out}, ${alpha}, ${size});
+""")
diff --git a/Deeploy/Targets/Generic/Templates/SubTemplate.py b/Deeploy/Targets/Generic/Templates/SubTemplate.py
new file mode 100644
index 0000000000..e5fade91ef
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/SubTemplate.py
@@ -0,0 +1,39 @@
+# SPDX-FileCopyrightText: 2021 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class _SubTemplate(NodeTemplate):
+    """Element-wise subtraction template; ``alignToContext`` computes the constant ``offset`` added per element."""
+    def alignToContext(
+            self, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> tuple[NetworkContext, OperatorRepresentation, list[str]]:
+        """Sum per-operand offsets of ``nLevels // 2`` (only for operands with ``_signed == 0``) into ``offset``."""
+        data_in_1 = ctxt.lookup(operatorRepresentation['data_in_1'])
+        data_in_2 = ctxt.lookup(operatorRepresentation['data_in_2'])
+        data_out = ctxt.lookup(operatorRepresentation['data_out'])
+        # An operand lacking either the ``_signed`` or the ``nLevels`` attribute contributes an offset of 0.
+        input_1_offset = 0
+        if hasattr(data_in_1, "_signed") and hasattr(data_in_1, "nLevels"):
+            input_1_offset = -(data_in_1._signed == 0) * int(data_in_1.nLevels // 2)
+        input_2_offset = 0
+        if hasattr(data_in_2, "_signed") and hasattr(data_in_2, "nLevels"):
+            input_2_offset = (data_in_2._signed == 0) * int(data_in_2.nLevels // 2)
+        output_offset = 0
+        if hasattr(data_out, "_signed") and hasattr(data_out, "nLevels"):
+            output_offset = (data_out._signed == 0) * int(data_out.nLevels // 2)
+        # NOTE(review): in1 enters negated, in2 and out positive; confirm these signs against the integer Add template.
+        operatorRepresentation['offset'] = input_1_offset + input_2_offset + output_offset
+
+        return ctxt, operatorRepresentation, []
+
+
+referenceTemplate = _SubTemplate("""
+// Sub (Name: ${nodeName}, Op: ${nodeOp})
+BEGIN_SINGLE_CORE
+    for (uint32_t i = 0; i < ${size}; i++){
+        ${data_out}[i] = ${data_in_1}[i] - ${data_in_2}[i] + ${offset};
+    }
+END_SINGLE_CORE
+""")
diff --git a/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_1D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_1D/inputs.npz
new file mode 100644
index 0000000000..ac58fc00e2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_1D/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_1D/network.onnx b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_1D/network.onnx
new file mode 100644
index 0000000000..9472fe8a05
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_1D/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_1D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_1D/outputs.npz
new file mode 100644
index 0000000000..ca18db8983
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_1D/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_2D/inputs.npz
new file mode 100644
index 0000000000..b80b42275c
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_2D/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_2D/network.onnx
new file mode 100644
index 0000000000..f69e84c010
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_2D/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_2D/outputs.npz
new file mode 100644
index 0000000000..1e6f505c5d
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/AveragePool/Regular_2D/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Ceil/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Ceil/inputs.npz
new file mode 100644
index 0000000000..ac58fc00e2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Ceil/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Ceil/network.onnx b/DeeployTest/Tests/Kernels/FP32/Ceil/network.onnx
new file mode 100644
index 0000000000..d24a1981a0
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Ceil/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Ceil/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Ceil/outputs.npz
new file mode 100644
index 0000000000..0911ac14bf
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Ceil/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Clip/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Clip/inputs.npz
new file mode 100644
index 0000000000..ac58fc00e2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Clip/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Clip/network.onnx b/DeeployTest/Tests/Kernels/FP32/Clip/network.onnx
new file mode 100644
index 0000000000..e79b10d0a1
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Clip/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Clip/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Clip/outputs.npz
new file mode 100644
index 0000000000..aba055ba03
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Clip/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Exp/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Exp/inputs.npz
new file mode 100644
index 0000000000..ac58fc00e2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Exp/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Exp/network.onnx b/DeeployTest/Tests/Kernels/FP32/Exp/network.onnx
new file mode 100644
index 0000000000..fc64515614
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Exp/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Exp/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Exp/outputs.npz
new file mode 100644
index 0000000000..8d57518ae0
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Exp/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Floor/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Floor/inputs.npz
new file mode 100644
index 0000000000..ac58fc00e2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Floor/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Floor/network.onnx b/DeeployTest/Tests/Kernels/FP32/Floor/network.onnx
new file mode 100644
index 0000000000..d570c282eb
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Floor/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Floor/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Floor/outputs.npz
new file mode 100644
index 0000000000..93c0cb3bd5
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Floor/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/GlobalAveragePool/inputs.npz b/DeeployTest/Tests/Kernels/FP32/GlobalAveragePool/inputs.npz
new file mode 100644
index 0000000000..b80b42275c
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GlobalAveragePool/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/GlobalAveragePool/network.onnx b/DeeployTest/Tests/Kernels/FP32/GlobalAveragePool/network.onnx
new file mode 100644
index 0000000000..4c7238af40
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GlobalAveragePool/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/GlobalAveragePool/outputs.npz b/DeeployTest/Tests/Kernels/FP32/GlobalAveragePool/outputs.npz
new file mode 100644
index 0000000000..2b68d327d0
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GlobalAveragePool/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/GlobalMaxPool/inputs.npz b/DeeployTest/Tests/Kernels/FP32/GlobalMaxPool/inputs.npz
new file mode 100644
index 0000000000..b80b42275c
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GlobalMaxPool/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/GlobalMaxPool/network.onnx b/DeeployTest/Tests/Kernels/FP32/GlobalMaxPool/network.onnx
new file mode 100644
index 0000000000..76bf8f7c37
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GlobalMaxPool/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/GlobalMaxPool/outputs.npz b/DeeployTest/Tests/Kernels/FP32/GlobalMaxPool/outputs.npz
new file mode 100644
index 0000000000..5c74873cb5
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GlobalMaxPool/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/GroupNorm/inputs.npz b/DeeployTest/Tests/Kernels/FP32/GroupNorm/inputs.npz
new file mode 100644
index 0000000000..b80b42275c
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GroupNorm/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/GroupNorm/network.onnx b/DeeployTest/Tests/Kernels/FP32/GroupNorm/network.onnx
new file mode 100644
index 0000000000..be2ab5484c
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GroupNorm/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/GroupNorm/outputs.npz b/DeeployTest/Tests/Kernels/FP32/GroupNorm/outputs.npz
new file mode 100644
index 0000000000..c1d73d6d67
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/GroupNorm/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/HardSigmoid/inputs.npz b/DeeployTest/Tests/Kernels/FP32/HardSigmoid/inputs.npz
new file mode 100644
index 0000000000..ac58fc00e2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/HardSigmoid/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/HardSigmoid/network.onnx b/DeeployTest/Tests/Kernels/FP32/HardSigmoid/network.onnx
new file mode 100644
index 0000000000..17b5354858
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/HardSigmoid/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/HardSigmoid/outputs.npz b/DeeployTest/Tests/Kernels/FP32/HardSigmoid/outputs.npz
new file mode 100644
index 0000000000..2e63fd2da1
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/HardSigmoid/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/HardSwish/inputs.npz b/DeeployTest/Tests/Kernels/FP32/HardSwish/inputs.npz
new file mode 100644
index 0000000000..ac58fc00e2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/HardSwish/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/HardSwish/network.onnx b/DeeployTest/Tests/Kernels/FP32/HardSwish/network.onnx
new file mode 100644
index 0000000000..281ddf23b0
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/HardSwish/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/HardSwish/outputs.npz b/DeeployTest/Tests/Kernels/FP32/HardSwish/outputs.npz
new file mode 100644
index 0000000000..d46d07aefe
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/HardSwish/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/InstanceNorm/inputs.npz b/DeeployTest/Tests/Kernels/FP32/InstanceNorm/inputs.npz
new file mode 100644
index 0000000000..b80b42275c
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/InstanceNorm/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/InstanceNorm/network.onnx b/DeeployTest/Tests/Kernels/FP32/InstanceNorm/network.onnx
new file mode 100644
index 0000000000..c817bc0c30
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/InstanceNorm/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/InstanceNorm/outputs.npz b/DeeployTest/Tests/Kernels/FP32/InstanceNorm/outputs.npz
new file mode 100644
index 0000000000..ace60623d0
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/InstanceNorm/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Sigmoid/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Sigmoid/inputs.npz
new file mode 100644
index 0000000000..ac58fc00e2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Sigmoid/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Sigmoid/network.onnx b/DeeployTest/Tests/Kernels/FP32/Sigmoid/network.onnx
new file mode 100644
index 0000000000..be561ee8a8
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Sigmoid/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Sigmoid/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Sigmoid/outputs.npz
new file mode 100644
index 0000000000..9bb1aebe67
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Sigmoid/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Sub/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Sub/inputs.npz
new file mode 100644
index 0000000000..c4bfb1f89b
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Sub/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Sub/network.onnx b/DeeployTest/Tests/Kernels/FP32/Sub/network.onnx
new file mode 100644
index 0000000000..b82f4c7c13
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Sub/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Sub/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Sub/outputs.npz
new file mode 100644
index 0000000000..805378eb88
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Sub/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Swish/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Swish/inputs.npz
new file mode 100644
index 0000000000..ac58fc00e2
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Swish/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Swish/network.onnx b/DeeployTest/Tests/Kernels/FP32/Swish/network.onnx
new file mode 100644
index 0000000000..9b5251da35
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Swish/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Swish/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Swish/outputs.npz
new file mode 100644
index 0000000000..cfd41c40cd
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Swish/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/Integer/Sub/inputs.npz b/DeeployTest/Tests/Kernels/Integer/Sub/inputs.npz
new file mode 100644
index 0000000000..411fad498f
Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Sub/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/Integer/Sub/network.onnx b/DeeployTest/Tests/Kernels/Integer/Sub/network.onnx
new file mode 100644
index 0000000000..b82f4c7c13
Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Sub/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/Integer/Sub/outputs.npz b/DeeployTest/Tests/Kernels/Integer/Sub/outputs.npz
new file mode 100644
index 0000000000..2b1dc905cc
Binary files /dev/null and b/DeeployTest/Tests/Kernels/Integer/Sub/outputs.npz differ
diff --git a/DeeployTest/test_generic_config.py b/DeeployTest/test_generic_config.py
index b0d8c659ca..eaea3d6400 100644
--- a/DeeployTest/test_generic_config.py
+++ b/DeeployTest/test_generic_config.py
@@ -8,6 +8,10 @@
     "Kernels/FP32/ReLU",
     "Kernels/FP32/Softmax/Regular",
     "Kernels/FP32/Add/Regular",
+    "Kernels/FP32/AveragePool/Regular_1D",
+    "Kernels/FP32/AveragePool/Regular_2D",
+    "Kernels/FP32/Ceil",
+    "Kernels/FP32/Clip",
     "Kernels/FP32/Conv/DW_2D_Bias",
     "Kernels/FP32/Conv/DW_2D_NoBias",
     "Kernels/FP32/Conv/DW_2D_ZeroValuedBias",
@@ -15,7 +19,15 @@
     "Kernels/FP32/Conv/Regular_2D_NoBias",
     "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias",
     "Kernels/FP32/Div",
+    "Kernels/FP32/Exp",
+    "Kernels/FP32/Floor",
     "Kernels/FP32/GEMM/Regular",
+    "Kernels/FP32/GlobalAveragePool",
+    "Kernels/FP32/GlobalMaxPool",
+    "Kernels/FP32/GroupNorm",
+    "Kernels/FP32/HardSigmoid",
+    "Kernels/FP32/HardSwish",
+    "Kernels/FP32/InstanceNorm",
     "Kernels/FP32/MatMul",
     "Kernels/FP32/MaxPool/Regular_1D",
     "Kernels/FP32/MaxPool/Regular_2D",
@@ -43,7 +55,10 @@
     "Kernels/FP32/ReduceMean/NoKeepDims/Axis2",
     "Kernels/FP32/ReduceMean/NoKeepDims/ReduceMean_Add",
     "Kernels/FP32/Reshape/SkipConnection",
+    "Kernels/FP32/Sigmoid",
     "Kernels/FP32/Sqrt",
+    "Kernels/FP32/Sub",
+    "Kernels/FP32/Swish",
     "Kernels/FP32/Transpose",
     # Integer Kernels
     "Kernels/Integer/Softmax/Regular",
@@ -63,6 +78,7 @@
     "Kernels/Integer/ReduceMean",
     "Kernels/Integer/ReduceSum",
     "Kernels/Integer/Slice",
+    "Kernels/Integer/Sub",
     # Special test from TinyViT model layers
     "Models/TinyViT/5M/Layers/FP32/ReduceMean",
     # Mixed Precision / Quantization
diff --git a/TargetLibraries/Generic/inc/DeeployBasicMath.h b/TargetLibraries/Generic/inc/DeeployBasicMath.h
index 22081701a3..2023b9e725 100644
--- a/TargetLibraries/Generic/inc/DeeployBasicMath.h
+++ b/TargetLibraries/Generic/inc/DeeployBasicMath.h
@@ -32,14 +32,24 @@
 #include "types.h"
 #include "utils.h"
 
+#include "kernel/AveragePool.h"
 #include "kernel/BatchNorm.h"
+#include "kernel/Ceil.h"
+#include "kernel/Clip.h"
 #include "kernel/ConvTranspose1d_fp32.h"
 #include "kernel/Convolution.h"
 #include "kernel/DWConvolution.h"
 #include "kernel/Div.h"
+#include "kernel/Exp.h"
+#include "kernel/Floor.h"
 #include "kernel/GELU.h"
 #include "kernel/Gemm.h"
-#include "kernel/Hardswish.h"
+#include "kernel/GlobalAveragePool.h"
+#include "kernel/GlobalMaxPool.h"
+#include "kernel/GroupNorm.h"
+#include "kernel/HardSigmoid.h"
+#include "kernel/HardSwish.h"
+#include "kernel/InstanceNorm.h"
 #include "kernel/Layernorm.h"
 #include "kernel/MatMul.h"
 #include "kernel/MaxPool.h"
@@ -50,7 +60,9 @@
 #include "kernel/RQHardswish.h"
 #include "kernel/Relu.h"
 #include "kernel/RequantShift.h"
+#include "kernel/Sigmoid.h"
 #include "kernel/Softmax.h"
 #include "kernel/Sqrt.h"
+#include "kernel/Swish.h"
 
 #endif //__DEEPLOY_BASIC_MATH_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/AveragePool.h b/TargetLibraries/Generic/inc/kernel/AveragePool.h
new file mode 100644
index 0000000000..2e0c786ffc
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/AveragePool.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_AVERAGEPOOL_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_AVERAGEPOOL_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/******************************************************************************/
+/*                                Average Pool                                */
+/******************************************************************************/
+void AveragePool2d_fp32_fp32(float32_t const *__restrict__ src,
+                             float32_t *__restrict__ dst, uint32_t N,
+                             uint32_t C, uint32_t H, uint32_t W,
+                             uint32_t kernel_h, uint32_t kernel_w,
+                             uint32_t stride_h, uint32_t stride_w,
+                             uint32_t pad_top, uint32_t pad_left,
+                             uint32_t pad_bottom, uint32_t pad_right);
+
+void AveragePool1d_fp32_fp32(float32_t const *__restrict__ src,
+                             float32_t *__restrict__ dst, uint32_t N,
+                             uint32_t C, uint32_t L, uint32_t kernel_len,
+                             uint32_t stride, uint32_t pad_left,
+                             uint32_t pad_right);
+
+#endif //__DEEPLOY_BASIC_MATH_AVERAGEPOOL_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/Ceil.h b/TargetLibraries/Generic/inc/kernel/Ceil.h
new file mode 100644
index 0000000000..941b90c75d
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/Ceil.h
@@ -0,0 +1,21 @@
+/*
+ * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_CEIL_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_CEIL_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/*
+ * element wise ceil operation
+ */
+
+/******************************************************************************/
+/*                              Ceil                                          */
+/******************************************************************************/
+void Ceil_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size);
+
+#endif //__DEEPLOY_BASIC_MATH_CEIL_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/Clip.h b/TargetLibraries/Generic/inc/kernel/Clip.h
new file mode 100644
index 0000000000..751c338c03
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/Clip.h
@@ -0,0 +1,22 @@
+/*
+ * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_CLIP_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_CLIP_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/*
+ * element wise clip operation
+ */
+
+/******************************************************************************/
+/*                              Clip                                          */
+/******************************************************************************/
+void Clip_fp32_fp32(float32_t *data_in, float32_t *data_out, float32_t min_val,
+                    float32_t max_val, int32_t size);
+
+#endif //__DEEPLOY_BASIC_MATH_CLIP_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/Exp.h b/TargetLibraries/Generic/inc/kernel/Exp.h
new file mode 100644
index 0000000000..330a4c476c
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/Exp.h
@@ -0,0 +1,21 @@
+/*
+ * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_EXP_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_EXP_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/*
+ * element wise exponential
+ */
+
+/******************************************************************************/
+/*                              Exp                                          */
+/******************************************************************************/
+void Exp_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size);
+
+#endif //__DEEPLOY_BASIC_MATH_EXP_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/Floor.h b/TargetLibraries/Generic/inc/kernel/Floor.h
new file mode 100644
index 0000000000..42ef3fd712
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/Floor.h
@@ -0,0 +1,21 @@
+/*
+ * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_FLOOR_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_FLOOR_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/*
+ * element wise floor operation
+ */
+
+/******************************************************************************/
+/*                              Floor                                         */
+/******************************************************************************/
+void Floor_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size);
+
+#endif //__DEEPLOY_BASIC_MATH_FLOOR_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/GlobalAveragePool.h b/TargetLibraries/Generic/inc/kernel/GlobalAveragePool.h
new file mode 100644
index 0000000000..a64484189e
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/GlobalAveragePool.h
@@ -0,0 +1,19 @@
+/*
+ * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_GLOBALAVERAGEPOOL_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_GLOBALAVERAGEPOOL_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/******************************************************************************/
+/*                            Global Average Pool                             */
+/******************************************************************************/
+void GlobalAveragePool_fp32_fp32(float32_t const *__restrict__ src,
+                                 float32_t *__restrict__ dst, uint32_t N,
+                                 uint32_t C, uint32_t spatial_size);
+
+#endif //__DEEPLOY_BASIC_MATH_GLOBALAVERAGEPOOL_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/GlobalMaxPool.h b/TargetLibraries/Generic/inc/kernel/GlobalMaxPool.h
new file mode 100644
index 0000000000..030b26211e
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/GlobalMaxPool.h
@@ -0,0 +1,19 @@
+/*
+ * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_GLOBALMAXPOOL_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_GLOBALMAXPOOL_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/******************************************************************************/
+/*                              Global Max Pool                               */
+/******************************************************************************/
+void GlobalMaxPool_fp32_fp32(float32_t const *__restrict__ src,
+                             float32_t *__restrict__ dst, uint32_t N,
+                             uint32_t C, uint32_t spatial_size);
+
+#endif //__DEEPLOY_BASIC_MATH_GLOBALMAXPOOL_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/GroupNorm.h b/TargetLibraries/Generic/inc/kernel/GroupNorm.h
new file mode 100644
index 0000000000..02ca5a2e57
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/GroupNorm.h
@@ -0,0 +1,23 @@
+/*
+ * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_GROUPNORM_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_GROUPNORM_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/******************************************************************************/
+/*                          Group Normalization                               */
+/******************************************************************************/
+void GroupNormalization_fp32_fp32(const float32_t *__restrict__ src,
+                                  float32_t *__restrict__ dst,
+                                  const float32_t *__restrict__ scale,
+                                  const float32_t *__restrict__ bias,
+                                  uint32_t batch_size, uint32_t num_channels,
+                                  uint32_t spatial, uint32_t num_groups,
+                                  float32_t epsilon);
+
+#endif //__DEEPLOY_BASIC_MATH_GROUPNORM_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/HardSigmoid.h b/TargetLibraries/Generic/inc/kernel/HardSigmoid.h
new file mode 100644
index 0000000000..542689eb33
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/HardSigmoid.h
@@ -0,0 +1,22 @@
+/*
+ * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_HARDSIGMOID_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_HARDSIGMOID_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/*
+ * element wise hard-sigmoid
+ */
+
+/******************************************************************************/
+/*                          HardSigmoid                                       */
+/******************************************************************************/
+void HardSigmoid_fp32_fp32(float32_t *data_in, float32_t *data_out,
+                           float32_t alpha, float32_t beta, int32_t size);
+
+#endif //__DEEPLOY_BASIC_MATH_HARDSIGMOID_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/Hardswish.h b/TargetLibraries/Generic/inc/kernel/HardSwish.h
similarity index 69%
rename from TargetLibraries/Generic/inc/kernel/Hardswish.h
rename to TargetLibraries/Generic/inc/kernel/HardSwish.h
index e0df42efbb..51e891622e 100644
--- a/TargetLibraries/Generic/inc/kernel/Hardswish.h
+++ b/TargetLibraries/Generic/inc/kernel/HardSwish.h
@@ -17,4 +17,10 @@ void iHardswish_s8_s32(int8_t *input, int32_t *output, int32_t size,
                        int32_t one_over_six, int32_t three, int32_t six,
                        int32_t input_offset);
 
+/******************************************************************************/
+/*                             Hardswish (fp32)                               */
+/******************************************************************************/
+
+void HardSwish_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size);
+
 #endif // __DEEPLOY_BASIC_MATH_HARDSWISH_KERNEL_HEADER_
\ No newline at end of file
diff --git a/TargetLibraries/Generic/inc/kernel/InstanceNorm.h b/TargetLibraries/Generic/inc/kernel/InstanceNorm.h
new file mode 100644
index 0000000000..dd68b0cadb
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/InstanceNorm.h
@@ -0,0 +1,22 @@
+/*
+ * SPDX-FileCopyrightText: 2023 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_INSTANCENORM_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_INSTANCENORM_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/******************************************************************************/
+/*                         Instance Normalization                             */
+/******************************************************************************/
+void InstanceNormalization_fp32_fp32(const float32_t *__restrict__ src,
+                                     float32_t *__restrict__ dst,
+                                     const float32_t *__restrict__ scale,
+                                     const float32_t *__restrict__ bias,
+                                     uint32_t batch_size, uint32_t num_channels,
+                                     uint32_t spatial, float32_t epsilon);
+
+#endif //__DEEPLOY_BASIC_MATH_INSTANCENORM_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/Sigmoid.h b/TargetLibraries/Generic/inc/kernel/Sigmoid.h
new file mode 100644
index 0000000000..d9a960cab3
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/Sigmoid.h
@@ -0,0 +1,21 @@
+/*
+ * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_SIGMOID_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_SIGMOID_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/*
+ * element wise sigmoid
+ */
+
+/******************************************************************************/
+/*                             Sigmoid                                        */
+/******************************************************************************/
+void Sigmoid_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size);
+
+#endif //__DEEPLOY_BASIC_MATH_SIGMOID_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/inc/kernel/Swish.h b/TargetLibraries/Generic/inc/kernel/Swish.h
new file mode 100644
index 0000000000..326f7822c8
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/Swish.h
@@ -0,0 +1,22 @@
+/*
+ * SPDX-FileCopyrightText: 2020 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_SWISH_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_SWISH_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+/*
+ * element wise swish
+ */
+
+/******************************************************************************/
+/*                              Swish                                         */
+/******************************************************************************/
+void Swish_fp32_fp32(float32_t *data_in, float32_t *data_out, float alpha,
+                     int32_t size);
+
+#endif //__DEEPLOY_BASIC_MATH_SWISH_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/src/AveragePool_fp32.c b/TargetLibraries/Generic/src/AveragePool_fp32.c
new file mode 100644
index 0000000000..6c17a8a49e
--- /dev/null
+++ b/TargetLibraries/Generic/src/AveragePool_fp32.c
@@ -0,0 +1,89 @@
+/*
+ * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+
+void AveragePool2d_fp32_fp32(float32_t const *__restrict__ src,
+                             float32_t *__restrict__ dst, uint32_t N,
+                             uint32_t C, uint32_t H, uint32_t W,
+                             uint32_t kernel_h, uint32_t kernel_w,
+                             uint32_t stride_h, uint32_t stride_w,
+                             uint32_t pad_top, uint32_t pad_left,
+                             uint32_t pad_bottom, uint32_t pad_right) {
+  // 2D average pooling of src [n][c][h][w]; unusable shapes leave dst as is.
+  if (N == 0 || C == 0 || stride_h == 0 || stride_w == 0 ||
+      (H + pad_top + pad_bottom) < kernel_h ||
+      (W + pad_left + pad_right) < kernel_w) {
+    return;
+  }
+  // Output extent; the guard above keeps both subtractions from wrapping.
+  uint32_t H_out = (H + pad_top + pad_bottom - kernel_h) / stride_h + 1;
+  uint32_t W_out = (W + pad_left + pad_right - kernel_w) / stride_w + 1;
+
+  for (uint32_t n = 0; n < N; ++n) {
+    for (uint32_t c = 0; c < C; ++c) {
+      for (uint32_t h_out = 0; h_out < H_out; h_out++) {
+        for (uint32_t w_out = 0; w_out < W_out; w_out++) {
+          // Padding taps are skipped; an all-padding window yields 0.0f.
+          float32_t sum = 0.0f;
+          uint32_t count = 0;
+          for (uint32_t kh = 0; kh < kernel_h; kh++) {
+            for (uint32_t kw = 0; kw < kernel_w; kw++) {
+              // Cast the pads too: the subtraction must be signed.
+              int32_t h_in =
+                  (int32_t)(h_out * stride_h + kh) - (int32_t)pad_top;
+              int32_t w_in =
+                  (int32_t)(w_out * stride_w + kw) - (int32_t)pad_left;
+              if (h_in >= 0 && h_in < (int32_t)H && w_in >= 0 &&
+                  w_in < (int32_t)W) {
+                sum += src[((n * C + c) * H + h_in) * W + w_in];
+                count++;
+              }
+            }
+          }
+          uint32_t idx = ((n * C + c) * H_out + h_out) * W_out + w_out;
+          dst[idx] = (count == 0) ? 0.0f : (sum / (float32_t)count);
+        }
+      }
+    }
+  }
+}
+
+void AveragePool1d_fp32_fp32(float32_t const *__restrict__ src,
+                             float32_t *__restrict__ dst, uint32_t N,
+                             uint32_t C, uint32_t L, uint32_t kernel_len,
+                             uint32_t stride, uint32_t pad_left,
+                             uint32_t pad_right) {
+  // 1D average pooling of src [n][c][l]. Padding taps are left out of the
+  // mean; a window made only of padding yields 0.0f.
+  const uint32_t padded_len = L + pad_left + pad_right;
+  if (N == 0 || C == 0 || stride == 0 || padded_len < kernel_len) {
+    return;
+  }
+  const uint32_t L_out = (padded_len - kernel_len) / stride + 1;
+
+  for (uint32_t n = 0; n < N; ++n) {
+    for (uint32_t c = 0; c < C; ++c) {
+      const uint32_t plane = n * C + c;
+      for (uint32_t l_out = 0; l_out < L_out; l_out++) {
+        float32_t acc = 0.0f;
+        uint32_t taps = 0;
+
+        for (uint32_t k = 0; k < kernel_len; k++) {
+          // Signed input position; anything outside [0, L) is padding.
+          int32_t pos = (int32_t)(l_out * stride + k) - (int32_t)pad_left;
+          if (pos < 0 || pos >= (int32_t)L) {
+            continue;
+          }
+          acc += src[plane * L + pos];
+          taps++;
+        }
+        dst[plane * L_out + l_out] =
+            (taps == 0) ? 0.0f : (acc / (float32_t)taps);
+      }
+    }
+  }
+}
\ No newline at end of file
diff --git a/TargetLibraries/Generic/src/Ceil_fp32.c b/TargetLibraries/Generic/src/Ceil_fp32.c
new file mode 100644
index 0000000000..fe73e20637
--- /dev/null
+++ b/TargetLibraries/Generic/src/Ceil_fp32.c
@@ -0,0 +1,14 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void Ceil_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size) {
+  // Element-wise ceilf(); nothing is written when size <= 0.
+  for (int k = 0; k < size; ++k)
+    data_out[k] = ceilf(data_in[k]);
+}
diff --git a/TargetLibraries/Generic/src/Clip_fp32.c b/TargetLibraries/Generic/src/Clip_fp32.c
new file mode 100644
index 0000000000..092fbf7b60
--- /dev/null
+++ b/TargetLibraries/Generic/src/Clip_fp32.c
@@ -0,0 +1,15 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void Clip_fp32_fp32(float32_t *data_in, float32_t *data_out, float32_t min_val,
+                    float32_t max_val, int32_t size) {
+  // Cap at max_val first, then raise to min_val (min_val wins if they cross).
+  for (int k = 0; k < size; ++k)
+    data_out[k] = fmaxf(min_val, fminf(max_val, data_in[k]));
+}
diff --git a/TargetLibraries/Generic/src/Exp_fp32.c b/TargetLibraries/Generic/src/Exp_fp32.c
new file mode 100644
index 0000000000..6dfdeb52db
--- /dev/null
+++ b/TargetLibraries/Generic/src/Exp_fp32.c
@@ -0,0 +1,14 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void Exp_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size) {
+  // Element-wise expf(); nothing is written when size <= 0.
+  for (int k = 0; k < size; ++k)
+    data_out[k] = expf(data_in[k]);
+}
diff --git a/TargetLibraries/Generic/src/Floor_fp32.c b/TargetLibraries/Generic/src/Floor_fp32.c
new file mode 100644
index 0000000000..2618e516fa
--- /dev/null
+++ b/TargetLibraries/Generic/src/Floor_fp32.c
@@ -0,0 +1,14 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void Floor_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size) {
+  // Element-wise floorf(); nothing is written when size <= 0.
+  for (int k = 0; k < size; ++k)
+    data_out[k] = floorf(data_in[k]);
+}
diff --git a/TargetLibraries/Generic/src/GlobalAveragePool_fp32.c b/TargetLibraries/Generic/src/GlobalAveragePool_fp32.c
new file mode 100644
index 0000000000..72c2c08aa0
--- /dev/null
+++ b/TargetLibraries/Generic/src/GlobalAveragePool_fp32.c
@@ -0,0 +1,28 @@
+/*
+ * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+
+void GlobalAveragePool_fp32_fp32(float32_t const *__restrict__ src,
+                                 float32_t *__restrict__ dst, uint32_t N,
+                                 uint32_t C, uint32_t spatial_size) {
+  // Mean over each (n, c) plane of spatial_size contiguous elements.
+  // An empty plane has no mean: return early and leave dst untouched.
+  if (spatial_size == 0) {
+    return;
+  }
+
+  for (uint32_t n = 0; n < N; ++n) {
+    for (uint32_t c = 0; c < C; ++c) {
+      const float32_t *plane = src + (n * C + c) * spatial_size;
+      float32_t acc = 0.0f;
+      for (uint32_t k = 0; k < spatial_size; ++k) {
+        acc += plane[k];
+      }
+      dst[n * C + c] = acc / spatial_size;
+    }
+  }
+}
\ No newline at end of file
diff --git a/TargetLibraries/Generic/src/GlobalMaxPool_fp32.c b/TargetLibraries/Generic/src/GlobalMaxPool_fp32.c
new file mode 100644
index 0000000000..db4fddac43
--- /dev/null
+++ b/TargetLibraries/Generic/src/GlobalMaxPool_fp32.c
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: 2022 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+
+void GlobalMaxPool_fp32_fp32(float32_t const *__restrict__ src,
+                             float32_t *__restrict__ dst, uint32_t N,
+                             uint32_t C, uint32_t spatial_size) {
+  // Writes the maximum of each (n, c) plane of spatial_size elements to dst.
+  // An empty plane has no maximum; return before x[0] is read out of bounds.
+  if (spatial_size == 0) {
+    return;
+  }
+  for (uint32_t n = 0; n < N; n++) {
+    for (uint32_t c = 0; c < C; c++) {
+      const float32_t *x = src + (n * C + c) * spatial_size;
+      float32_t max = x[0];
+      for (uint32_t i = 1; i < spatial_size; i++) {
+        if (x[i] > max) {
+          max = x[i];
+        }
+      }
+      dst[n * C + c] = max;
+    }
+  }
+}
\ No newline at end of file
diff --git a/TargetLibraries/Generic/src/GroupNormalization_fp32.c b/TargetLibraries/Generic/src/GroupNormalization_fp32.c
new file mode 100644
index 0000000000..24fbf66a72
--- /dev/null
+++ b/TargetLibraries/Generic/src/GroupNormalization_fp32.c
@@ -0,0 +1,64 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void GroupNormalization_fp32_fp32(
+    const float32_t *__restrict__ src, float32_t *__restrict__ dst,
+    const float32_t *__restrict__ scale, const float32_t *__restrict__ bias,
+    uint32_t batch_size, uint32_t num_channels,
+    uint32_t spatial, // spatial dimension (L or H*W or D*H*W, etc.)
+    uint32_t num_groups, float32_t epsilon) {
+  // For each (batch, group) block of channels_per_group * spatial elements:
+  //   y = scale[c] * (x - mean) * inv_std + bias[c]  (c = global channel).
+  if (num_groups == 0 || spatial == 0 || (num_channels % num_groups) != 0) {
+    return;
+  }
+  uint32_t channels_per_group = num_channels / num_groups;
+  uint32_t group_elements = channels_per_group * spatial;
+  if (group_elements == 0) {
+    return;
+  }
+  uint32_t slice = num_channels * spatial; // elements per batch
+
+  for (uint32_t n = 0; n < batch_size; ++n) {
+    for (uint32_t g = 0; g < num_groups; ++g) {
+      uint32_t group_offset = n * slice + g * group_elements;
+      const float32_t *x_group = src + group_offset;
+
+      /* --- mean (accumulated and divided in float64_t) --- */
+      float64_t sum = 0.0;
+      for (uint32_t i = 0; i < group_elements; ++i) {
+        sum += x_group[i];
+      }
+      float64_t mean = sum / (float64_t)group_elements;
+
+      /* --- variance (population: divided by the element count) --- */
+      float64_t var = 0.0;
+      for (uint32_t i = 0; i < group_elements; ++i) {
+        float64_t d = (float64_t)x_group[i] - mean;
+        var += d * d;
+      }
+      var /= (float64_t)group_elements;
+
+      /* --- normalize + affine --- */
+      float32_t inv_std = (float32_t)(1.0 / sqrt(var + (float64_t)epsilon));
+      float32_t m = (float32_t)mean;
+
+      for (uint32_t lc = 0; lc < channels_per_group; ++lc) {
+        const float32_t *x_channel = x_group + lc * spatial;
+        float32_t *y_channel = dst + group_offset + lc * spatial;
+        uint32_t c = g * channels_per_group + lc; // global channel
+        float32_t s = scale[c];
+        float32_t b = bias[c];
+        for (uint32_t i = 0; i < spatial; ++i) {
+          y_channel[i] = s * (x_channel[i] - m) * inv_std + b;
+        }
+      }
+    }
+  }
+}
diff --git a/TargetLibraries/Generic/src/HardSigmoid_fp32.c b/TargetLibraries/Generic/src/HardSigmoid_fp32.c
new file mode 100644
index 0000000000..a436e3f1d8
--- /dev/null
+++ b/TargetLibraries/Generic/src/HardSigmoid_fp32.c
@@ -0,0 +1,15 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void HardSigmoid_fp32_fp32(float32_t *data_in, float32_t *data_out,
+                           float32_t alpha, float32_t beta, int32_t size) {
+  // y = alpha * x + beta, saturated to the closed interval [0, 1].
+  for (int k = 0; k < size; ++k)
+    data_out[k] = fmaxf(0.0f, fminf(1.0f, alpha * data_in[k] + beta));
+}
diff --git a/TargetLibraries/Generic/src/HardSwish_fp32.c b/TargetLibraries/Generic/src/HardSwish_fp32.c
new file mode 100644
index 0000000000..41e4f424b4
--- /dev/null
+++ b/TargetLibraries/Generic/src/HardSwish_fp32.c
@@ -0,0 +1,16 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void HardSwish_fp32_fp32(float32_t *data_in, float32_t *data_out,
+                         int32_t size) {
+  for (int i = 0; i < size; i++) { // y = x * clamp(x / 6 + 0.5, 0, 1)
+    float32_t x = data_in[i];      // float literals: no double promotion
+    data_out[i] = x * fmaxf(0.0f, fminf(1.0f, x / 6.0f + 0.5f));
+  }
+}
diff --git a/TargetLibraries/Generic/src/InstanceNormalization_fp32.c b/TargetLibraries/Generic/src/InstanceNormalization_fp32.c
new file mode 100644
index 0000000000..23405dc11d
--- /dev/null
+++ b/TargetLibraries/Generic/src/InstanceNormalization_fp32.c
@@ -0,0 +1,54 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void InstanceNormalization_fp32_fp32(
+    const float32_t *__restrict__ src, float32_t *__restrict__ dst,
+    const float32_t *__restrict__ scale, const float32_t *__restrict__ bias,
+    uint32_t batch_size, uint32_t num_channels,
+    uint32_t spatial, // spatial dimension (L or H*W or D*H*W, etc.)
+    float32_t epsilon) {
+  // For each (batch, channel) run of `spatial` contiguous elements:
+  //   y = scale[c] * (x - mean) * inv_std + bias[c].
+  if (spatial == 0) {
+    return; // nothing to normalize; also avoids dividing by zero
+  }
+  uint32_t slice = num_channels * spatial; // elements per batch
+
+  for (uint32_t n = 0; n < batch_size; ++n) {
+    for (uint32_t c = 0; c < num_channels; ++c) {
+      uint32_t channel_offset = n * slice + c * spatial;
+      const float32_t *x = src + channel_offset;
+      float32_t *y = dst + channel_offset;
+
+      /* --- mean (accumulated and divided in float64_t) --- */
+      float64_t sum = 0.0;
+      for (uint32_t i = 0; i < spatial; ++i)
+        sum += x[i];
+      float64_t mean = sum / (float64_t)spatial;
+
+      /* --- variance (population: divided by the element count) --- */
+      float64_t var = 0.0;
+      for (uint32_t i = 0; i < spatial; ++i) {
+        float64_t d = (float64_t)x[i] - mean;
+        var += d * d;
+      }
+      var /= (float64_t)spatial;
+
+      /* --- normalize + affine --- */
+      float32_t inv_std = (float32_t)(1.0 / sqrt(var + (float64_t)epsilon));
+      float32_t g = scale[c];
+      float32_t b = bias[c];
+      float32_t m = (float32_t)mean;
+
+      for (uint32_t i = 0; i < spatial; ++i) {
+        y[i] = g * (x[i] - m) * inv_std + b;
+      }
+    }
+  }
+}
diff --git a/TargetLibraries/Generic/src/Sigmoid_fp32.c b/TargetLibraries/Generic/src/Sigmoid_fp32.c
new file mode 100644
index 0000000000..1c98bdfc6f
--- /dev/null
+++ b/TargetLibraries/Generic/src/Sigmoid_fp32.c
@@ -0,0 +1,14 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void Sigmoid_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t size) {
+  // Logistic function 1 / (1 + e^-x), applied element by element.
+  for (int k = 0; k < size; ++k)
+    data_out[k] = 1.0f / (1.0f + expf(-data_in[k]));
+}
diff --git a/TargetLibraries/Generic/src/Swish_fp32.c b/TargetLibraries/Generic/src/Swish_fp32.c
new file mode 100644
index 0000000000..5447de4c6a
--- /dev/null
+++ b/TargetLibraries/Generic/src/Swish_fp32.c
@@ -0,0 +1,16 @@
+/*
+ * SPDX-FileCopyrightText: 2025 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void Swish_fp32_fp32(float32_t *data_in, float32_t *data_out, float alpha,
+                     int32_t size) {
+  // Swish: x * sigmoid(alpha * x), written as x / (1 + e^(-alpha * x)).
+  // Nothing is written when size <= 0.
+  for (int k = 0; k < size; ++k)
+    data_out[k] = data_in[k] / (1.0f + expf(-alpha * data_in[k]));
+}