diff --git a/.github/workflows/BuildDocker.yml b/.github/workflows/BuildDocker.yml
index 01dd15ef32..6e0ac03a3c 100644
--- a/.github/workflows/BuildDocker.yml
+++ b/.github/workflows/BuildDocker.yml
@@ -38,4 +38,4 @@ jobs:
           file: Container/Dockerfile
           push: true
           # JUNGVI: If you operate from a fork and want to build a new docker make sure to replace 'pulp-platform' by your uname.
-          tags: ghcr.io/pulp-platform/deeploy:main
+          tags: ghcr.io/runwangdl/deeploy:redmule
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 7d355c822b..6ed9866c5f 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -9,7 +9,7 @@ on:
     - cron: "0 1 */6 * *"
 
 env:
-  DOCKER_IMAGE: ghcr.io/pulp-platform/deeploy:main
+  DOCKER_IMAGE: ghcr.io/runwangdl/deeploy:redmule
 
 jobs:
 
@@ -338,7 +338,7 @@ jobs:
           },
           {
             "name": "testFloat2DConvolution",
-            "L1": [2000]
+            "L1": [8000]
           },
           {
             "name": "testFloatLayerNorm",
@@ -420,7 +420,7 @@ jobs:
           },
           {
             "name": "testFloat2DConvolution",
-            "L1": [4000]
+            "L1": [15000]
           },
           {
             "name": "testFloatLayerNorm",
@@ -514,12 +514,8 @@ jobs:
             L1: [64000]
           - name: "CCT/CCT_1_16_16_64"
             L1: [64000]
-          - name: "CCT/CCT_1_16_16_128"
-            L1: [64000]
           - name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_64"
             L1: [64000]
-          - name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_128"
-            L1: [64000]
         num-cores:
           - 8
         default-memory-level:
@@ -559,12 +555,8 @@ jobs:
             L1: [64000]
           - name: "CCT/CCT_1_16_16_64"
             L1: [64000]
-          - name: "CCT/CCT_1_16_16_128"
-            L1: [64000]
           - name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_64"
             L1: [64000]
-          - name: "testTrainCCT/CCT_Classifier_Training/CCT_1_16_16_128"
-            L1: [64000]
         num-cores:
           - 8
         double-buffer:
@@ -748,6 +740,42 @@ jobs:
       default-memory-level: ${{ matrix.default-memory-level }}
       neureka-wmem: ${{ matrix.neureka-wmem }}
 
+  siracusa-redmule-kernels-tiled-singlebuffer-L2:
+    strategy:
+      fail-fast: false
+      matrix:
+        test-data:
+          - name: "testFloatMatmul"
+            L1: [8000]
+        num-cores:
+          - 8
+    uses: ./.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
+    needs: select-docker-image
+    with:
+      docker-image: ${{ needs.select-docker-image.outputs.image }}
+      test-name: ${{ matrix.test-data.name }}
+      num-cores: ${{ matrix.num-cores }}
+      L1: ${{ toJson(matrix.test-data.L1) }}
+
+  siracusa-redmule-kernels-tiled-doublebuffer-L2:
+    strategy:
+      fail-fast: false
+      matrix:
+        test-data:
+          - name: "testFloatMatmul"
+            L1: [8000]
+        num-cores:
+          - 8
+        double-buffer:
+          - true
+    uses: ./.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
+    needs: select-docker-image
+    with:
+      docker-image: ${{ needs.select-docker-image.outputs.image }}
+      test-name: ${{ matrix.test-data.name }}
+      num-cores: ${{ matrix.num-cores }}
+      L1: ${{ toJson(matrix.test-data.L1) }}
+      double-buffer: ${{ matrix.double-buffer }}
 
 ### Deeploy Extension and Internal Tests ###
 
   deeploy-memory-allocation:
diff --git a/.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml b/.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
new file mode 100644
index 0000000000..e12c4b675c
--- /dev/null
+++ b/.github/workflows/TestRunnerTiledSiracusaWithRedmule.yml
@@ -0,0 +1,72 @@
+name: TestRunnerTiledSiracusaWithRedmule
+
+on:
+  workflow_call:
+    inputs:
+      docker-image:
+        required: true
+        type: string
+      test-name:
+        required: true
+        type: string
+      num-cores:
+        required: false
+        default: 8
+        type: number
+      L1:
+        required: false
+        default: "[64000]"
+        type: string
+      default-memory-level:
+        required: false
+        default: "L2"
+        type: string
+      double-buffer:
+        required: false
+        default: false
+        type: boolean
+      memory-allocation-strategy:
+        required: false
+        default: "MiniMalloc"
+        type: string
+      search-strategy:
+        required: false
+        default: "random-max"
+        type: string
+
+jobs:
+
+  test-runner-siracusa-tiled:
+    strategy:
+      fail-fast: false
+      matrix:
+        L1: ${{ fromJSON(inputs.L1) }}
+    runs-on: ubuntu-22.04
+    container:
+      image: ${{ inputs.docker-image }}
+    steps:
+      - name: Checkout Repo
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Build Deeploy
+        run: pip install -e .
+      - name: Cache ccache
+        id: ccache-cache
+        uses: actions/cache@v4
+        with:
+          path: /app/.ccache
+          key: ${{ runner.os }}-ccache
+      - name: Run Test
+        uses: nick-fields/retry@v3
+        with:
+          timeout_minutes: 15
+          max_attempts: 3
+          retry_on: timeout
+          command: |
+            cd DeeployTest
+            mkdir -p /app/.ccache
+            export CCACHE_DIR=/app/.ccache
+            python testRunner_tiled_siracusa_w_redmule.py -t Tests/${{ inputs.test-name }} --cores=${{ inputs.num-cores }} --l1 ${{ matrix.L1 }} --defaultMemLevel=${{ inputs.default-memory-level }} ${{ inputs.double-buffer && '--doublebuffer' || '' }} --memAllocStrategy=${{ inputs.memory-allocation-strategy }} --searchStrategy=${{ inputs.search-strategy }}
+          shell: bash
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index f14ed74461..6f8381d29b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -282,4 +282,18 @@ Change main.c to use OUTPUTTYPE instead of float
 ### Changed
 - The ISA for the Siracusa platform has been updated from rv32imc_zfinx_xpulpv2 to rv32imf_xpulpv2.
-- All floating-point comparison tasks in deeploytest.c are now offloaded to Cluster 0 for execution.
\ No newline at end of file
+- All floating-point comparison tasks in deeploytest.c are now offloaded to Cluster 0 for execution.
+
+## Add RV32IMF Picolibc support for Siracusa platform
+
+### Added
+- Adds RV32IMF Picolibc to the toolchain
+
+## Parallelization and Optimization of CCT Inference and Training Kernels
+
+### Added
+- Parallel Matmul, Softmax, Gelu, Conv, Layernorm, Maxpool, Add
+- Gelu with sigmoid approximation
+- Im2col Conv
+- Matmul with 1x7 unrolling, performance aligned with pulptrainlib
+- Compute op support for multiple float kernels: Maxpool, Relu, Mul
diff --git a/CMakeLists.txt b/CMakeLists.txt
index b23293dd55..dbdfb86409 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -15,8 +15,8 @@ if(TOOLCHAIN STREQUAL GCC)
   set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
 endif()
 
-set(platform MemPool CACHE STRING "Platform (MemPool, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, Generic, Snitch)")
-set_property(CACHE platform PROPERTY STRINGS MemPool QEMU Siracusa Siracusa_w_neureka PULP-Open Generic Snitch)
+set(platform MemPool CACHE STRING "Platform (MemPool, QEMU, Siracusa, Siracusa_w_neureka, Siracusa_w_redmule, PULP-Open, Generic, Snitch)")
+set_property(CACHE platform PROPERTY STRINGS MemPool QEMU Siracusa Siracusa_w_neureka Siracusa_w_redmule PULP-Open Generic Snitch)
 
 if(platform STREQUAL MemPool)
   message(STATUS "Building for platform 'MemPool'")
@@ -26,6 +26,8 @@ elseif(platform STREQUAL Siracusa)
   message(STATUS "Building for platform 'Siracusa'")
 elseif(platform STREQUAL Siracusa_w_neureka)
   message(STATUS "Building for platform 'Siracusa_w_neureka'")
+elseif(platform STREQUAL Siracusa_w_redmule)
+  message(STATUS "Building for platform 'Siracusa_w_redmule'")
 elseif(platform STREQUAL PULPOpen)
   message(STATUS "Building for platform 'PULP-Open'")
 elseif(platform STREQUAL Generic)
@@ -148,7 +150,7 @@ if(platform STREQUAL QEMU-ARM)
 endif()
 
-if(platform STREQUAL Siracusa OR 
platform STREQUAL Siracusa_w_neureka OR platform STREQUAL PULPOpen) +if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule OR platform STREQUAL PULPOpen) if(TOOLCHAIN STREQUAL LLVM) set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/toolchain_llvm.cmake) @@ -158,7 +160,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp.cmake) - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/siracusa/siracusa.cmake) elseif(platform STREQUAL PULPOpen) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp-open/pulp-open.cmake) diff --git a/Container/Dockerfile b/Container/Dockerfile index ce77db92ad..2d0a78c78f 100644 --- a/Container/Dockerfile +++ b/Container/Dockerfile @@ -42,7 +42,9 @@ RUN DEBIAN_FRONTEND=noninteractive apt-get install -y git-lfs \ libsdl2-ttf-dev \ gcc-multilib \ wget \ - clang-format + clang-format \ + libxtensor-dev \ + libxsimd-dev # Install cmake 3.31.1 RUN wget https://github.com/Kitware/CMake/releases/download/v3.31.1/cmake-3.31.1-linux-x86_64.sh && \ diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index b7249c5e83..5fe3c389bd 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -69,15 +69,16 @@ def __init__(self, maps: List[NodeMapper]): super().__init__(maps) def computeOps(self): - compAbs = self.mapper.parser.operatorRepresentation['size'] - compAdd = self.mapper.parser.operatorRepresentation['size'] - compSqr = self.mapper.parser.operatorRepresentation['size'] - compMul = self.mapper.parser.operatorRepresentation['size'] - compAdd = self.mapper.parser.operatorRepresentation['size'] - compMul2 = self.mapper.parser.operatorRepresentation['size'] - compAdd2 = self.mapper.parser.operatorRepresentation['size'] - compDiv = self.mapper.parser.operatorRepresentation['size'] - return compAbs + compAdd + compSqr + compMul + compAdd + compMul2 + compAdd2 + compDiv + size = self.mapper.parser.operatorRepresentation['size'] + # RW: Sigmoid approximation + mul1 = size # Multiply by 1.702 + neg = size # Negate the result + exp = size # Compute exponential + add = size # Add 1 + div = size # Division for sigmoid + mul2 = size # Final multiplication by x + + return mul1 + neg + exp + add + div + mul2 class iHardswishLayer(ONNXLayer): @@ -120,12 +121,39 @@ class SoftmaxLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + def computeOps(self): + + size = self.mapper.parser.operatorRepresentation['size'] + last_dim_length = self.mapper.parser.operatorRepresentation['lastDimLength'] + batch_size = size // last_dim_length + + max_ops = last_dim_length - 1 + exp_ops = last_dim_length * 2 + sum_ops = last_dim_length - 1 + div_ops = last_dim_length + ops_per_batch = max_ops + exp_ops + sum_ops + div_ops + total_ops = ops_per_batch * batch_size + + return total_ops + class SoftmaxGradLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + def computeOps(self): + input_size = self.mapper.parser.operatorRepresentation['size'] + + # SoftmaxGrad operation: dy * (y - (y * sum(dy * y))) + mul_ops = input_size + sum_ops = input_size + broadcast_mul_ops = input_size + sub_ops = input_size + final_mul_ops = input_size + + return mul_ops + sum_ops + 
broadcast_mul_ops + sub_ops + final_mul_ops + class ITAMaxLayer(ONNXLayer): @@ -252,7 +280,7 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese N = inputShapes[1][-1] if len(inputShapes) == 3: - inputShapes[2] = [M, N] + inputShapes[2] = outputShapes[0] return (inputShapes, outputShapes) @@ -317,6 +345,9 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese inputShapes[0] = inputShapes[1] return (inputShapes, outputShapes) + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + class ConvLayer(ONNXLayer): @@ -374,6 +405,14 @@ class MaxPoolLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + def computeOps(self): + kernel_shape = self.mapper.parser.operatorRepresentation['kernel_shape'] + elements_per_window = int(np.prod(kernel_shape)) + data_out_size = self.mapper.parser.operatorRepresentation['data_out_size'] + comparisons_per_window = elements_per_window - 1 + total_ops = data_out_size * comparisons_per_window + return total_ops + class ReduceMeanLayer(ONNXLayer): @@ -403,6 +442,9 @@ class ReluLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): super().__init__(maps) + def computeOps(self): + return self.mapper.parser.operatorRepresentation['size'] + class LayerNormLayer(ONNXLayer): diff --git a/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py b/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py index 711436b7a1..7b011d76d5 100644 --- a/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatGELUTemplate.py @@ -1,12 +1,12 @@ # ---------------------------------------------------------------------- # -# File: iGELUTemplate.py +# File: FloatGELUTemplate.py # -# Last edited: 13.12.2021 +# Last edited: 28.03.2025 # # Copyright (C) 2021, ETH Zurich and University of Bologna. 
# -# Author: Moritz Scherer, ETH Zurich +# Author: Run Wang, ETH Zurich # # ---------------------------------------------------------------------- # SPDX-License-Identifier: Apache-2.0 @@ -28,4 +28,4 @@ referenceTemplate = NodeTemplate(""" // GELU (Name: ${nodeName}, Op: ${nodeOp}) SINGLE_CORE GELU_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}); -""") +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 8fc4d9d97b..a8ba45ed46 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -37,23 +37,23 @@ from Deeploy.DeeployTypes import CodeTransformation, NodeBinding, NodeTemplate from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration -from Deeploy.Targets.Generic.Templates import ConcatTemplate, DequantTemplate, FloatGELUTemplate, FloatGemmTemplate, \ - FloatLayernormTemplate, FloatMatMulTemplate, FloatMulTemplate, FloatReduceSumTemplate, FloatReluTemplate, \ - FloatSoftmaxTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, iHardswishTemplate -from Deeploy.Targets.Generic.TypeCheckers import ConcatChecker, ConvChecker, DequantChecker, GatherChecker, \ - GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, QuantChecker, \ - ReduceMeanChecker, ReluChecker, RQAddChecker, RQHardswishChecker, SGDChecker, SliceChecker, SoftmaxChecker, \ - SoftmaxCrossEntropyLossChecker, TransposeChecker +from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatGemmTemplate, \ + FloatMulTemplate, FloatReduceSumTemplate, FloatSoftmaxTemplate, GatherTemplate, QuantTemplate, RQSiGELUTemplate, \ + iHardswishTemplate +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ + GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ + QuantChecker, ReduceMeanChecker, ReluChecker, RQAddChecker, RQHardswishChecker, SGDChecker, SliceChecker, \ + SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPProfileUntiled import PULPProfileUntiled from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatConvTemplate, FloatMaxPoolTemplate, \ - FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, \ - RequantShiftTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SliceTemplate, \ - SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ - iRMSNormTemplate, iSoftmaxTemplate +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, \ + FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatReluTemplate, FloatSoftmaxTemplate, \ + GEMMTemplate, MatrixVectorTemplate, MaxPool2DTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, \ + 
RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SliceTemplate, SoftmaxCrossEntropyLossTemplate, \ + TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement @@ -172,6 +172,16 @@ for _type3 in [int8_t, uint8_t] ] +PULPAddBindings = [ + NodeBinding(AddChecker([PointerClass(type1), PointerClass(type2)], [PointerClass(int32_t)]), + AddTemplate.referenceTemplate, ForkTransformer) + for type1 in IntegerDataTypes + for type2 in IntegerDataTypes +] + [ + NodeBinding(AddChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatAddTemplate.referenceTemplate, ForkTransformer) +] + PULPRQSConv2DBindings = [ NodeBinding( PULPConvChecker([ @@ -215,7 +225,7 @@ PULPFloatConv2DBindings = [ NodeBinding( ConvChecker([PointerClass(float32_t), PointerClass(float32_t), - PointerClass(float32_t)], [PointerClass(float32_t)]), FloatConvTemplate.reference2DTemplate, + PointerClass(float32_t)], [PointerClass(float32_t)]), FloatConvTemplate.reference2DIm2ColTemplate, ForkTransformer) ] @@ -264,7 +274,7 @@ GEMMTemplate.PULPMM_8_Template, ClusterTransformer) ] + [ NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), - FloatMatMulTemplate.referenceTemplate, ClusterTransformer) + FloatMatMulTemplate.referenceTemplate, ForkTransformer) ] PULPReduceMeanBindings = [ diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py new file mode 100644 index 0000000000..850de69e55 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatAddTemplate.py @@ -0,0 +1,50 @@ +# ---------------------------------------------------------------------- +# +# File: FloatAddTemplate.py +# +# Last edited: 13.11.2024 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Authors: +# - Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
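+#
+# The template below splits the flat tensor evenly across the cluster
+# cores: each core takes one contiguous chunk of ceil(size / NUM_CORES)
+# elements, computed with a shift by log2(NUM_CORES), which assumes
+# NUM_CORES is a power of two. A Python sketch of the per-core bounds
+# (hypothetical helper, for illustration only):
+#
+#     def chunk_bounds(size: int, num_cores: int, core_id: int):
+#         log2_cores = num_cores.bit_length() - 1
+#         chunk = (size >> log2_cores) + ((size & (num_cores - 1)) != 0)
+#         start = min(chunk * core_id, size)
+#         return start, min(start + chunk, size)
+#
+#     # size=100, num_cores=8: core 0 -> (0, 13), core 7 -> (91, 100)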
+ +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// Add Parallel with 1x6 unrolling (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); +int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); + +uint32_t i = ${nodeName}_chunk_start; +for (; i+5 < ${nodeName}_chunk_stop; i+=6) { + ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i]; + ${data_out}[i+1] = ${data_in_1}[i+1] + ${data_in_2}[i+1]; + ${data_out}[i+2] = ${data_in_1}[i+2] + ${data_in_2}[i+2]; + ${data_out}[i+3] = ${data_in_1}[i+3] + ${data_in_2}[i+3]; + ${data_out}[i+4] = ${data_in_1}[i+4] + ${data_in_2}[i+4]; + ${data_out}[i+5] = ${data_in_1}[i+5] + ${data_in_2}[i+5]; +} + +for (; i < ${nodeName}_chunk_stop; i++) { + ${data_out}[i] = ${data_in_1}[i] + ${data_in_2}[i]; +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index 9e8ec57643..0a368fd413 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -23,26 +23,115 @@ # See the License for the specific language governing permissions and # limitations under the License. -from Deeploy.DeeployTypes import NodeTemplate +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class PULP2DFloatConvIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * + operatorRepresentation['dim_kernel_y']) + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = PULP2DFloatConvIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + reference2DTemplate = NodeTemplate(""" -// 2D FP Conv HWC (Name: ${nodeName}, Op: ${nodeOp}) -BEGIN_SINGLE_CORE - ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; - ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; - - for (uint32_t n=0; n<${batch}; ++n) { - Conv2d_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( - ref_${data_out}_${data_in}, ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in}, - ${weight}, ${ch_im_out}, - ${dim_kernel_x}, ${dim_kernel_y}, - ${stride_x}, ${stride_y}, - ref_${data_out}_${data_out}, - ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} - ); - ref_${data_out}_${data_in} += 
${ch_im_in}*${dim_im_in_x}*${dim_im_in_y}; - ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; - } -END_SINGLE_CORE +// 2D FP Conv HWC Parallel (Name: ${nodeName}, Op: ${nodeOp}) + +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_ch_out_chunk = (${ch_im_out} >> ${nodeName}_log2Core) + ((${ch_im_out} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_ch_out_start = MIN(${nodeName}_ch_out_chunk*${nodeName}_core_id, ${ch_im_out}); +int16_t ${nodeName}_ch_out_stop = MIN(${nodeName}_ch_out_start + ${nodeName}_ch_out_chunk, ${ch_im_out}); +int16_t ${nodeName}_ch_out_count = ${nodeName}_ch_out_stop - ${nodeName}_ch_out_start; + +${weight_type.typeName} ${nodeName}_weight_ptr = ${weight} + ${nodeName}_ch_out_start * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}; + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + +for (uint32_t n=0; n<${batch}; ++n) { + + Conv2d_ChannelRange_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, ${dim_im_in_y}, ${dim_im_in_x}, ${ch_im_in}, + ${nodeName}_weight_ptr, ${nodeName}_ch_out_count, + ${dim_kernel_y}, ${dim_kernel_x}, + ${stride_y}, ${stride_x}, + ref_${data_out}_${data_out}, ${ch_im_out}, ${nodeName}_ch_out_start, + ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right} + ); + + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} + +""") + +reference2DIm2ColTemplate = PULP2DFloatConvIm2ColTemplate(""" +// 2D FP Conv HWC Parallel with Im2Col (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_ch_out_chunk = (${ch_im_out} >> ${nodeName}_log2Core) + ((${ch_im_out} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_ch_out_start = MIN(${nodeName}_ch_out_chunk*${nodeName}_core_id, ${ch_im_out}); +int16_t ${nodeName}_ch_out_stop = MIN(${nodeName}_ch_out_start + ${nodeName}_ch_out_chunk, ${ch_im_out}); +int16_t ${nodeName}_ch_out_count = ${nodeName}_ch_out_stop - ${nodeName}_ch_out_start; + +${weight_type.typeName} ${nodeName}_weight_ptr = ${weight} + ${nodeName}_ch_out_start * ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}; + + +uint32_t ${nodeName}_im2col_size_per_core = ${ch_im_in} * ${dim_kernel_x} * ${dim_kernel_y}; +${data_out_type.typeName} ${nodeName}_im2col_buffer = ((${data_out_type.typeName})${ctxtBuffer}) + ${nodeName}_core_id * ${nodeName}_im2col_size_per_core; + +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + + Conv2d_Im2Col_ChannelRange_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC( + ref_${data_out}_${data_in}, + ${dim_im_in_y}, + ${dim_im_in_x}, + ${ch_im_in}, + ${nodeName}_weight_ptr, + ${nodeName}_ch_out_count, + ${dim_kernel_y}, + ${dim_kernel_x}, + ${stride_y}, + ${stride_x}, + ref_${data_out}_${data_out}, + ${ch_im_out}, + ${nodeName}_ch_out_start, + ${padding_y_top}, + ${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, + ${nodeName}_im2col_buffer + ); + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} 
* ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} """) diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py new file mode 100644 index 0000000000..40890b3426 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGELUTemplate.py @@ -0,0 +1,37 @@ +# ---------------------------------------------------------------------- +# +# File: FloatGELUTemplate.py +# +# Last edited: 04.05.2025 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// GELU Parallel (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int16_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int16_t ${nodeName}_chunk_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); +int16_t ${nodeName}_chunk_stop = MIN(${nodeName}_chunk_start + ${nodeName}_chunk, ${size}); + +GELU_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_sigmoid_chunk(${data_in}, ${data_out}, ${nodeName}_chunk_start, ${nodeName}_chunk_stop); +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py new file mode 100644 index 0000000000..ccb4c03751 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatLayernormTemplate.py @@ -0,0 +1,62 @@ +# ---------------------------------------------------------------------- +# +# File: FloatLayernormTemplate.py +# +# Last edited: 23.01.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
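+#
+# Parallelization note: the tensor is treated as (size / lastDimLength)
+# rows of lastDimLength elements, and each core normalizes a contiguous
+# block of whole rows, so the per-row mean and variance never cross a
+# core boundary. For example, with size = 4096, lastDimLength = 64 and
+# 8 cores, each core handles 8 rows, i.e. 512 elements (assuming
+# NUM_CORES is a power of two, as the log2-based chunking below requires).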
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// FloatLayernorm Parallel (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_log2Core = log2(NUM_CORES);
+
+int32_t ${nodeName}_seq_length = ${size} / ${lastDimLength};
+int32_t ${nodeName}_chunk = (${nodeName}_seq_length >> ${nodeName}_log2Core) +
+                            ((${nodeName}_seq_length & (NUM_CORES-1)) != 0);
+int32_t ${nodeName}_start = MIN(${nodeName}_chunk * ${nodeName}_core_id, ${nodeName}_seq_length);
+int32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${nodeName}_seq_length);
+
+
+int32_t ${nodeName}_elem_start = ${nodeName}_start * ${lastDimLength};
+int32_t ${nodeName}_elem_end = ${nodeName}_end * ${lastDimLength};
+int32_t ${nodeName}_elem_count = ${nodeName}_elem_end - ${nodeName}_elem_start;
+
+
+const float* ${nodeName}_data_in_ptr = ${data_in} + ${nodeName}_elem_start;
+float* ${nodeName}_data_out_ptr = ${data_out} + ${nodeName}_elem_start;
+
+
+if (${nodeName}_elem_count > 0) {
+    Layernorm_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(
+        ${nodeName}_data_in_ptr,
+        ${nodeName}_data_out_ptr,
+        ${weight},
+        ${bias},
+        ${epsilon},
+        ${nodeName}_elem_count,
+        ${lastDimLength}
+    );
+}
+
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py
new file mode 100644
index 0000000000..7d558b7100
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatMatMulTemplate.py
@@ -0,0 +1,53 @@
+# ----------------------------------------------------------------------
+#
+# File: FloatMatMulTemplate.py
+#
+# Last edited: 28.03.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// Matmul with row parallelism (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_log2Core = log2(NUM_CORES);
+int32_t ${nodeName}_M_chunk = (${M} >> ${nodeName}_log2Core) + ((${M} & (NUM_CORES-1))!=0);
+int32_t ${nodeName}_M_start = MIN(${nodeName}_core_id * ${nodeName}_M_chunk, ${M});
+int32_t ${nodeName}_M_end = MIN(${nodeName}_M_start + ${nodeName}_M_chunk, ${M});
+int32_t ${nodeName}_M_size = ${nodeName}_M_end - ${nodeName}_M_start;
+
+for(uint32_t b=0; b<${batch}; b++) {
+    ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
+    ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
+    ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};
+
+    if (${nodeName}_M_size > 0) {
+        MatMul_fp32_fp32_fp32_unroll1x7(
+            batch_A + ${nodeName}_M_start * ${N},
+            batch_B,
+            batch_out + ${nodeName}_M_start * ${O},
+            ${nodeName}_M_size,
+            ${N},
+            ${O}
+        );
+    }
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py
index 5c58ed6723..fd1e83b9b1 100644
--- a/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatMaxPoolTemplate.py
@@ -26,23 +26,29 @@
 from Deeploy.DeeployTypes import NodeTemplate
 
 referenceTemplate = NodeTemplate("""
+// 2D Float MaxPool Channel Parallel (Name: ${nodeName}, Op: ${nodeOp})
 
-// 2D Float MaxPool (Name: ${nodeName}, Op: ${nodeOp})
-BEGIN_SINGLE_CORE
-    ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
-    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_log2Core = log2(NUM_CORES);
+int16_t ${nodeName}_ch_chunk = (${ch_im_in} >> ${nodeName}_log2Core) + ((${ch_im_in} & (NUM_CORES-1))!=0);
+int16_t ${nodeName}_ch_start = MIN(${nodeName}_ch_chunk*${nodeName}_core_id, ${ch_im_in});
+int16_t ${nodeName}_ch_stop = MIN(${nodeName}_ch_start + ${nodeName}_ch_chunk, ${ch_im_in});
+int16_t ${nodeName}_ch_count = ${nodeName}_ch_stop - ${nodeName}_ch_start;
 
-    for (uint32_t n=0; n<${batch}; ++n) {
-        MaxPool2d_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC(
-            ref_${data_out}_${data_in},
-            ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in},
-            ${dim_kernel_x}, ${dim_kernel_y},
-            ${stride_x}, ${stride_y},
-            ref_${data_out}_${data_out},
-            ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right}
-        );
-        ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y};
-        ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y};
-    }
-END_SINGLE_CORE
-""")
+${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
+${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+
+for (uint32_t n=0; n<${batch}; ++n) {
+    MaxPool2d_ChannelRange_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC(
+        ref_${data_out}_${data_in},
+        ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in},
+        ${dim_kernel_x}, ${dim_kernel_y},
+        ${stride_x}, ${stride_y},
+        ref_${data_out}_${data_out},
+        ${padding_y_top}, ${padding_y_bottom}, ${padding_x_left}, ${padding_x_right},
+        ${nodeName}_ch_start, ${nodeName}_ch_count
+    );
+    ref_${data_out}_${data_in} += ${ch_im_in}*${dim_im_in_x}*${dim_im_in_y};
+    
ref_${data_out}_${data_out} += ${ch_im_out}*${dim_im_out_x}*${dim_im_out_y}; +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py new file mode 100644 index 0000000000..a6e93ae6ae --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/FloatReluTemplate.py @@ -0,0 +1,44 @@ +# ---------------------------------------------------------------------- +# +# File: FloatReluTemplate.py +# +# Last edited: 04.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// Parallel ReLU (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int32_t ${nodeName}_chunk = (${size} >> ${nodeName}_log2Core) + ((${size} & (NUM_CORES-1))!=0); +int32_t ${nodeName}_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${size}); +int32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, ${size}); +int32_t ${nodeName}_local_size = ${nodeName}_end - ${nodeName}_start; + +if (${nodeName}_local_size > 0) { + Relu_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ${data_in} + ${nodeName}_start, + ${data_out} + ${nodeName}_start, + ${nodeName}_local_size + ); +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py index be2fbc796c..01edb04676 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatSoftmaxTemplate.py @@ -26,8 +26,25 @@ from Deeploy.DeeployTypes import NodeTemplate referenceTemplate = NodeTemplate(""" -// Softmax (Name: ${nodeName}, Op: ${nodeOp}) -SINGLE_CORE Softmax_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}(${data_in}, ${data_out}, ${size}, ${lastDimLength}); +// Softmax Parallel (Name: ${nodeName}, Op: ${nodeOp}) +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_log2Core = log2(NUM_CORES); +int32_t ${nodeName}_num_vectors = ${size} / ${lastDimLength}; +int32_t ${nodeName}_chunk = (${nodeName}_num_vectors >> ${nodeName}_log2Core) + ((${nodeName}_num_vectors & (NUM_CORES-1))!=0); +int32_t ${nodeName}_vector_start = MIN(${nodeName}_chunk*${nodeName}_core_id, ${nodeName}_num_vectors); +int32_t ${nodeName}_vector_end = MIN(${nodeName}_vector_start + ${nodeName}_chunk, ${nodeName}_num_vectors); +int32_t ${nodeName}_local_size = (${nodeName}_vector_end - ${nodeName}_vector_start) * ${lastDimLength}; + +if (${nodeName}_local_size > 0) { + int32_t ${nodeName}_data_offset = ${nodeName}_vector_start * ${lastDimLength}; + + 
Softmax_fp${data_in_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}( + ${data_in} + ${nodeName}_data_offset, + ${data_out} + ${nodeName}_data_offset, + ${nodeName}_local_size, + ${lastDimLength} + ); +} """) referenceGradientTemplate = NodeTemplate(""" diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py index 457bd3fda7..ea94e2db7a 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -355,25 +355,17 @@ def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkCo strides = parseDict["strides"] padding = parseDict["pads"] - # VIC: Force at least one row of A and one col of B in the GEMM (since it's a im2col Conv) to avoid partial results + # RW: Conv only tiled on outchannel + tilerModel.addConstraint(inputHeightVar == parseDict['dim_im_in_x']) + tilerModel.addConstraint(inputWidthVar == parseDict['dim_im_in_y']) tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) - if (parseDict["ch_im_out"] >= 8): - tilerModel.addMinTileSizeConstraint(parseDict, 'ch_im_out', outputChannelVar, 8) - - tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) - tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y']) - tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) - - # VIC: Constraint the minimum tile size such that we can apply at least one kernel on it - tilerModel.addConstraint(inputHeightVar >= parseDict['dim_kernel_x']) - tilerModel.addConstraint(inputWidthVar >= parseDict['dim_kernel_y']) - tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x']) tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y']) + tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) - tilerModel.addConstraint((inputHeightVar % strides[0]) == 0) - tilerModel.addConstraint((inputWidthVar % strides[1]) == 0) + if (parseDict["ch_im_out"] >= 8): + tilerModel.addMinTileSizeConstraint(parseDict, 'ch_im_out', outputChannelVar, 8) return tilerModel diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py index 7f8a456265..b72cc9115e 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/GEMMTileConstraint.py @@ -235,6 +235,7 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw dimOffsetA = len(bufferA.shape) - 2 dimOffsetB = len(bufferB.shape) - 2 + dimOffsetC = len(bufferC.shape) - 2 dimOffsetOut = len(outputBuffer.shape) - 2 AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA']) @@ -253,8 +254,8 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw # Add GEMM Geometrical constraints tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) - addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = 0) - addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = 1) + addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC) + addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1) tilerModel.addConstraint(outputFirstDimVar == addDimVar_1) tilerModel.addConstraint(outputSecondDimVar == addDimVar_2) diff --git 
a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py
index 21d638d0b8..78fb77cf77 100644
--- a/Deeploy/Targets/PULPOpen/Tiler.py
+++ b/Deeploy/Targets/PULPOpen/Tiler.py
@@ -28,7 +28,7 @@
 from Deeploy.CommonExtensions.CodeTransformationPasses.MemoryAllocation import MemoryPassthroughGeneration
 from Deeploy.DeeployTypes import CodeTransformation
-from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicReshapeBindings
+from Deeploy.Targets.Generic.Bindings import BasicReshapeBindings
 from Deeploy.Targets.Generic.TileConstraints.AddTileConstraint import AddTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.ConcatTileConstraint import ConcatTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.iHardswishTileConstraint import iHardswishTileConstraint
@@ -40,13 +40,14 @@
 from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint
-from Deeploy.Targets.PULPOpen.Bindings import ForkTransformer, PULPConcatBindings, PULPFloatConv2DBindings, \
+from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \
     PULPFloatGELUBinding, PULPFloatGEMMBindings, PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \
     PULPiRQSGELUBindings, PULPLayernormBinding, PULPMatMulBindings, PULPMaxPool2DBindings, PULPMulBindings, \
     PULPReduceSumBindings, PULPReluBinding, PULPRQAddBindings, PULPRQSBindings, PULPRQSConv2DBindings, \
     PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, \
     PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSoftmaxBindings, PULPSoftmaxCrossEntropyLossBindings, \
-    PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings
+    PULPSoftmaxCrossEntropyLossGradBindings, PULPSoftmaxGradBindings, PULPTransposeBindings, \
+    PULPUniformRQSBindings
 from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv2DTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint
@@ -114,11 +115,7 @@
 PULPTransposeTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPTransposeBindings,
                                                            tileConstraint = TransposeTileConstraint())
 
-_PULPAddBindings = copy.deepcopy(BasicAddBindings)
-for binding in _PULPAddBindings:
-    binding.codeTransformer = ForkTransformer
-
-PULPAddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = _PULPAddBindings,
+PULPAddTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPAddBindings,
                                                      tileConstraint = AddTileConstraint())
 
 PULPSoftmaxTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSoftmaxBindings,
diff --git a/Deeploy/Targets/Redmule/Bindings.py b/Deeploy/Targets/Redmule/Bindings.py
new file mode 100644
index 0000000000..df811b8b5f
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Bindings.py
@@ -0,0 +1,52 @@
+# ----------------------------------------------------------------------
+#
+# File: Bindings.py
+#
+# Last edited: 10.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+# +# Author: +# Luka Macan, University of Bologna +# Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t +from Deeploy.DeeployTypes import NodeBinding +from Deeploy.Targets.Generic.TypeCheckers import MatMulChecker, ConvChecker, GEMMChecker +from Deeploy.Targets.Redmule.Templates import MatmulTemplate, ConvTemplate, GEMMTemplate +from Deeploy.Targets.PULPOpen.Bindings import ForkTransformer + +RedmuleMatmulBindings = [ + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + MatmulTemplate.referenceTemplate, ForkTransformer) +] + +RedmuleConv2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), ConvTemplate.reference2DIm2ColTemplate, + ForkTransformer) +] + +RedmuleGEMMBindings = [ + NodeBinding( + GEMMChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), GEMMTemplate.referenceTemplate, + ForkTransformer) +] \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/Deployer.py b/Deeploy/Targets/Redmule/Deployer.py new file mode 100644 index 0000000000..23b322bbfc --- /dev/null +++ b/Deeploy/Targets/Redmule/Deployer.py @@ -0,0 +1,52 @@ +# ---------------------------------------------------------------------- +# +# File: Deployer.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
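+#
+# RedmuleDeployer reuses the full PULPDeployer flow and only appends two
+# RedMule-specific lowering passes, so they run after the standard PULP
+# lowering. A minimal usage sketch (argument values are placeholders,
+# not a tested configuration):
+#
+#     deployer = RedmuleDeployer(graph, platform, inputTypes, optimizer)
+#     # deployer.loweringOptimizer.passes now ends with the two RedMule passes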
+
+from typing import Callable, Dict, Type
+
+import onnx_graphsurgeon as gs
+
+from Deeploy.AbstractDataTypes import Pointer
+from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
+from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer
+from Deeploy.Targets.Redmule.TopologyOptimizationPasses.Passes import RedMuleAdjustWeightMemoryLayoutPass, \
+    RedMuleGEMMTransposePass
+
+
+class RedmuleDeployer(PULPDeployer):
+
+    def __init__(self,
+                 graph: gs.Graph,
+                 deploymentPlatform: DeploymentPlatform,
+                 inputTypes: Dict[str, Type[Pointer]],
+                 loweringOptimizer: TopologyOptimizer,
+                 scheduler: Callable = lambda graph: list(graph.nodes),
+                 name: str = 'DeeployNetwork',
+                 default_channels_first = False,
+                 deeployStateDir: str = "DeeployStateDir",
+                 inputOffsets = {}):
+        super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
+                         default_channels_first, deeployStateDir, inputOffsets)
+
+        self.loweringOptimizer.passes += [
+            RedMuleAdjustWeightMemoryLayoutPass("Redmule"),
+            RedMuleGEMMTransposePass("Redmule")
+        ]
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Engine.py b/Deeploy/Targets/Redmule/Engine.py
new file mode 100644
index 0000000000..1022362c57
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Engine.py
@@ -0,0 +1,65 @@
+# ----------------------------------------------------------------------
+#
+# File: Engine.py
+#
+# Last edited: 26.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Moritz Scherer, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
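+#
+# The engine only claims the operators listed in RedmuleMapping below
+# ('MatMul', 'Conv', 'Gemm'); in RedmulePlatform it is registered next to
+# a PULPClusterEngine, so every other operator keeps using the regular
+# PULP cluster kernels.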
+
+from typing import List
+
+from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper
+from Deeploy.Targets.Generic.Layers import ConvLayer, GEMMLayer, MatMulLayer
+from Deeploy.Targets.Generic.Parsers import MatMulParser
+from Deeploy.Targets.PULPOpen.Parsers import PULPFPConv2DParser
+from Deeploy.Targets.Redmule.Parsers import GEMMRedmuleParser
+from Deeploy.Targets.Redmule.Tiler import RedmuleConvTilingReadyBindings, RedmuleGEMMTilingReadyBindings, \
+    RedmuleMatMulTilingReadyBindings
+
+MatMulRedmuleMapper = NodeMapper(MatMulParser(), RedmuleMatMulTilingReadyBindings)
+Conv2DRedmuleMapper = NodeMapper(PULPFPConv2DParser(), RedmuleConvTilingReadyBindings)
+GEMMRedmuleMapper = NodeMapper(GEMMRedmuleParser(), RedmuleGEMMTilingReadyBindings)
+
+RedmuleMapping = {
+    'MatMul': MatMulLayer([MatMulRedmuleMapper]),
+    'Conv': ConvLayer([Conv2DRedmuleMapper]),
+    'Gemm': GEMMLayer([GEMMRedmuleMapper]),
+}
+
+_includeList = []
+
+_redmuleInitCode = r"""
+// Redmule engine initialization
+"""
+
+
+class RedmuleEngine(DeploymentEngine):
+
+    def __init__(self,
+                 name: str,
+                 Mapping = RedmuleMapping,
+                 initCode: str = _redmuleInitCode,
+                 includeList: List[str] = _includeList) -> None:
+        super().__init__(name, Mapping, initCode, includeList)
+
+
diff --git a/Deeploy/Targets/Redmule/Parsers.py b/Deeploy/Targets/Redmule/Parsers.py
new file mode 100644
index 0000000000..383fe59f31
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Parsers.py
@@ -0,0 +1,99 @@
+# ----------------------------------------------------------------------
+#
+# File: Parsers.py
+#
+# Last edited: 15.12.2021
+#
+# Copyright (C) 2021, ETH Zurich and University of Bologna.
+#
+# Authors:
+# - Moritz Scherer, ETH Zurich
+# - Victor Jung, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
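+#
+# GEMMRedmuleParser accepts Gemm-style nodes with alpha == 1 and fills in
+# the ONNX Gemm defaults for any missing attribute (transA = 0,
+# transB = 0, alpha = 1, beta = 1). For example, a node with
+# attrs == {'transB': 1} parses as transA = 0, transB = 1, alpha = 1,
+# beta = 1.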
+
+from typing import Tuple
+
+import numpy as np
+import onnx_graphsurgeon as gs
+
+from Deeploy.DeeployTypes import NetworkContext
+from Deeploy.Targets.Generic.Parsers import MatMulParser
+
+
+class GEMMRedmuleParser(MatMulParser):
+
+    def __init__(self, noBiasHoisting = True):
+        self.noBiasHoisting = noBiasHoisting
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+
+        ret = all([
+            len(node.inputs) >= 2,
+            len(node.outputs) == 1,
+            # Use .get() so nodes without an explicit alpha (ONNX default 1.0) are accepted
+            node.attrs.get('alpha', 1) == 1
+        ])
+
+        if ret:
+            if 'transA' in node.attrs:
+                self.operatorRepresentation['transA'] = node.attrs['transA']
+            else:
+                self.operatorRepresentation['transA'] = 0
+
+            if 'transB' in node.attrs:
+                self.operatorRepresentation['transB'] = node.attrs['transB']
+            else:
+                self.operatorRepresentation['transB'] = 0
+            if 'alpha' in node.attrs:
+                self.operatorRepresentation['alpha'] = node.attrs['alpha']
+            else:
+                self.operatorRepresentation['alpha'] = 1
+            if 'beta' in node.attrs:
+                self.operatorRepresentation['beta'] = node.attrs['beta']
+            else:
+                self.operatorRepresentation['beta'] = 1
+
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)
+
+        if ret:
+            inputs = ['A', 'B']
+            outputs = ['data_out']
+
+            for idx, inputNode in enumerate(node.inputs):
+                if idx < len(inputs):
+                    self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name
+            for idx, outputNode in enumerate(node.outputs):
+                self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name
+
+            if len(node.inputs) == 3:
+                self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name
+            elif not self.noBiasHoisting:
+                values = np.zeros((1))
+                zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values)
+                newCtxt.hoistConstant(zeroTensor)
+                self.operatorRepresentation['C'] = f'{node.name}_C_Tensor'
+
+            self.operatorRepresentation['size'] = np.prod(newCtxt.lookup(node.inputs[0].name).shape)
+
+        return newCtxt, ret
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Platform.py b/Deeploy/Targets/Redmule/Platform.py
new file mode 100644
index 0000000000..c0587a4ead
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Platform.py
@@ -0,0 +1,45 @@
+# ----------------------------------------------------------------------
+#
+# File: Platform.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
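+#
+# RedmulePlatform differs from PULPPlatform only in its engine list: a
+# RedmuleEngine is registered alongside the usual PULPClusterEngine, so
+# RedMule-mapped operators (MatMul, Conv, Gemm) can go to the accelerator
+# while everything else stays on the cluster. Instantiation sketch:
+#
+#     platform = RedmulePlatform()   # engines: Redmule + PULPCluster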
+ +from Deeploy.DeeployTypes import TopologyOptimizer +from Deeploy.Targets.Redmule.Engine import RedmuleEngine +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, \ + PULPOptimizer, PULPPlatform, PULPStructBuffer, PULPTransientBuffer, PULPVariableBuffer, PULPConstantBuffer + +RedmuleOptimizer = TopologyOptimizer([ + *PULPOptimizer.passes +]) + +class RedmulePlatform(PULPPlatform): + + def __init__(self, + engines = [RedmuleEngine("Redmule"), PULPClusterEngine("PULPCluster")], + variableBuffer = PULPVariableBuffer, + constantBuffer = PULPConstantBuffer, + structBuffer = PULPStructBuffer, + transientBuffer = PULPTransientBuffer) -> None: + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) + + diff --git a/Deeploy/Targets/Redmule/Templates/ConvTemplate.py b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py new file mode 100644 index 0000000000..5ad5f51e5f --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py @@ -0,0 +1,88 @@ +# ---------------------------------------------------------------------- +# +# File: ConvTemplate.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
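+#
+# The transient im2col buffer hoisted below is sized as
+#     4 * 8 * (ch_im_in * dim_kernel_x * dim_kernel_y)
+# bytes, i.e. one float32 (4 bytes) im2col column replicated 8 times; the
+# factor 8 mirrors the per-core buffers of the PULP im2col template. For
+# example, ch_im_in = 16 with a 3x3 kernel reserves 4 * 8 * 144 = 4608
+# bytes.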
+
+from typing import Dict, List, Tuple, Union
+
+from ortools.constraint_solver.pywrapcp import IntVar
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class RedmuleFloatConvIm2ColTemplate(NodeTemplate):
+
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+
+    @staticmethod
+    def computeTransientBuffersSize(
+            ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
+        im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] *
+                              operatorRepresentation['dim_kernel_y'])
+        im2col_name = operatorRepresentation['nodeName'] + "_buffer"
+        return [(im2col_name, im2col_dim)]
+
+    def hoistTransientBuffers(self, ctxt: NetworkContext,
+                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        im2col_name, im2col_dim = RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize(ctxt, operatorRepresentation)[0]
+        ctxt.hoistTransientBuffer(im2col_name, im2col_dim)
+
+        operatorRepresentation['ctxtBuffer'] = im2col_name
+        operatorRepresentation['ctxtBufferSize'] = im2col_dim
+        return ctxt, operatorRepresentation, [im2col_name]
+
+
+reference2DIm2ColTemplate = RedmuleFloatConvIm2ColTemplate("""
+// 2D FP Conv HWC Parallel with Im2Col (Name: ${nodeName}, Op: ${nodeOp})
+${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
+${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+
+for (uint32_t n=0; n<${batch}; ++n) {
+
+    Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC_8_Redmule(
+        ref_${data_out}_${data_in},
+        ${dim_im_in_y},
+        ${dim_im_in_x},
+        ${ch_im_in},
+        ${weight},
+        ${dim_kernel_y},
+        ${dim_kernel_x},
+        ${stride_y},
+        ${stride_x},
+        ref_${data_out}_${data_out},
+        ${ch_im_out},
+        ${padding_y_top},
+        ${padding_y_bottom},
+        ${padding_x_left},
+        ${padding_x_right},
+        ${ctxtBuffer}
+    );
+
+    ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y};
+    ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y};
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py
new file mode 100644
index 0000000000..1ac45c3e6d
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py
@@ -0,0 +1,62 @@
+# ----------------------------------------------------------------------
+#
+# File: GEMMTemplate.py
+#
+# Last edited: 27.01.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// GEMM using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+
+if (${nodeName}_core_id == 0) {
+    for(uint32_t b=0; b<${batch}; b++) {
+        ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
+        ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
+        ${C_type.typeName} batch_C = ${C} + b * ${M} * ${O};
+        ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};
+
+        % if beta == 0:
+        MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
+            (const float32_t *) batch_A,
+            (const float32_t *) batch_B,
+            (float32_t *) batch_out,
+            ${M},
+            ${N},
+            ${O}
+        );
+        % else:
+        Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
+            (const float32_t *) batch_A,
+            (const float32_t *) batch_B,
+            (const float32_t *) batch_C,
+            (float32_t *) batch_out,
+            ${M},
+            ${N},
+            ${O}
+        );
+        % endif
+    }
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py
new file mode 100644
index 0000000000..cb077ca897
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py
@@ -0,0 +1,49 @@
+# ----------------------------------------------------------------------
+#
+# File: MatmulTemplate.py
+#
+# Last edited: 27.01.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// Matmul using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_num_cores = NUM_CORES;
+
+if (${nodeName}_core_id == 0) {
+    for(uint32_t b=0; b<${batch}; b++) {
+        ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
+        ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
+        ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};
+
+        MatMul_fp32_fp32_fp32_Redmule(
+            (const float32_t *) batch_A,
+            (const float32_t *) batch_B,
+            (float32_t *) batch_out,
+            ${M},
+            ${N},
+            ${O}
+        );
+    }
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Templates/__init__.py b/Deeploy/Targets/Redmule/Templates/__init__.py
new file mode 100644
index 0000000000..a73187ca8f
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------
+#
+# File: __init__.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py new file mode 100644 index 0000000000..61ef736773 --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py @@ -0,0 +1,283 @@ + + +# ---------------------------------------------------------------------- +# +# File: ConvTileConstraint.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
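# A numpy reference for the per-batch computation the GEMM and MatMul templates
# above expand to (a minimal sketch: alpha is fixed to 1 by the parser, and the
# RedMule Gemm kernel takes no beta argument, so a present C is added as-is):
import numpy as np

def redmule_gemm_reference(A: np.ndarray, B: np.ndarray, C = None, beta = 1):
    out = A @ B                      # MatMul_..._Redmule path when beta == 0
    if beta != 0 and C is not None:
        out = out + C                # Gemm_..._Redmule path otherwise
    return out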
+ +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel, PerformanceHint +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + + +class RedmuleConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + strides = parseDict["strides"] + padding = parseDict["pads"] + dilation = parseDict["dilations"] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel (now at index 3) + + inputBuffer = ctxt.lookup(inputBufferName) + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (weightHeightVar - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (weightWidthVar - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = 
tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + padding = parseDict["pads"] + + tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + # RW: Conv only tiled on outchannel + tilerModel.addConstraint(inputHeightVar == parseDict['dim_im_in_x']) + tilerModel.addConstraint(inputWidthVar == parseDict['dim_im_in_y']) + tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + + tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x']) + tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y']) + tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) + + outChannel = parseDict["ch_im_out"] + if outChannel >= 12: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "ch_im_out", + weightOutChannelVar, + 12, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max(), strategy = PerformanceHint(1)) + + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + symbolicParseDict = parseDict.copy() + symbolicParseDict['dim_im_in_x'] = tilerModel.getTensorDimVar(inputBuffer.name, 1) + # Using updated dimension indexes for kernel dimensions + symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 0) + symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 1) + + return symbolicParseDict + + @staticmethod + def computeMargins(kernelShape: Tuple[int, ...]) -> Tuple[int, ...]: + if kernelShape[1] % 2 == 0: + leftMargin = 0 + rightMargin = 0 + else: + leftMargin = ((kernelShape[1]) // 2) + rightMargin = ((kernelShape[1]) // 2) + + if kernelShape[0] % 2 == 0: + topMargin = 0 + bottomMargin = 0 + else: + topMargin = ((kernelShape[0]) // 2) + bottomMargin = ((kernelShape[0]) // 2) + + return leftMargin, rightMargin, topMargin, bottomMargin + + @staticmethod + def computeInputCube(kernelShape: Tuple[int, ...], pads: Tuple[int, ...], strides: Tuple[int, ...], + weightChannels: int, outputCube: HyperRectangle, + outputDims: Tuple[int, ...]) -> Tuple[HyperRectangle, Tuple[int, ...]]: + + (BatchOffset, HOffset, WOffset, COffset) = outputCube.offset + (BatchSize, HSize, WSize, CSize) = outputCube.dims + + leftMargin, rightMargin, topMargin, bottomMargin = RedmuleConv2DTileConstraint.computeMargins(kernelShape) + + padding_top = (HOffset == 0) * pads[0] + padding_bottom = (HOffset + HSize == outputDims[1]) * pads[2] + + padding_left = (WOffset == 0) * pads[1] + padding_right = (WOffset + WSize == outputDims[2]) * pads[3] + + inputHOffset = HOffset * strides[0] - topMargin * (HOffset != 0) + inputWOffset = WOffset * strides[1] - leftMargin * (WOffset != 0) + + inputHSize = HSize * strides[0] + (topMargin + bottomMargin) - (padding_top + 
padding_bottom) + inputWSize = WSize * strides[1] + (leftMargin + rightMargin) - (padding_left + padding_right) + + InCube = HyperRectangle((BatchOffset, inputHOffset, inputWOffset, 0), + (BatchSize, inputHSize, inputWSize, weightChannels)) + + return InCube, (padding_left, padding_right, padding_top, padding_bottom) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'weight', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varWeight = operatorRepresentation['weight'] + varOut = operatorRepresentation['data_out'] + + inputInCubes = [] + inputWeightCubes = [] + replacements: Dict[str, List[int]] = { + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "ch_im_out": [], + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [] + } + + replacementTypes = { + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "ch_im_out": PointerClass(uint16_t), + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t) + } + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightH = ctxt.lookup(varWeight).shape[0] # Now index 0 + weightW = ctxt.lookup(varWeight).shape[1] # Now index 1 + weightC = ctxt.lookup(varWeight).shape[2] # Now index 2 (Cin) + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for cube in outputCubes: + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = RedmuleConv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, weightC, + cube, + ctxt.lookup(varOut).shape) + + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['dim_im_in_x'].append(InCube.dims[1]) + replacements['dim_im_in_y'].append(InCube.dims[2]) + replacements['dim_im_out_x'].append(HSize) + replacements['dim_im_out_y'].append(WSize) + replacements['ch_im_out'].append(CSize) + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inputInCubes.append(InCube) + + # Updated WeightCube for (H, W, Cin, Cout) format + # COffset is now applied to dimension 3 (Cout) + WeightCube = HyperRectangle((0, 0, 0, COffset), (weightH, weightW, weightC, CSize)) + + inputWeightCubes.append(WeightCube) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for a, b in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": a, "weight": b}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, 
tilingSchedule \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py new file mode 100644 index 0000000000..a91a0b929c --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py @@ -0,0 +1,198 @@ + +# ---------------------------------------------------------------------- +# +# File: GEMMTileConstraint.py +# +# Last edited: 02.06.2023 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich +# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleGEMMTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + bufferC = ctxt.lookup(name = parseDict['C']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + # Add I/O dimensions to the model as variables + for bufferName in [bufferA.name, bufferB.name, bufferC.name, outputBuffer.name]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + dimOffsetA = len(bufferA.shape) - 2 + dimOffsetB = len(bufferB.shape) - 2 + dimOffsetC = len(bufferC.shape) - 2 + dimOffsetOut = len(outputBuffer.shape) - 2 + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = dimOffsetA + 1 - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = dimOffsetB + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = dimOffsetB + 1 - parseDict['transB']) + outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = dimOffsetOut) + outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = dimOffsetOut + 1) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) + tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) + + # Add GEMM Geometrical constraints + tilerModel.addConstraint(ASecondDimVar == 
BFirstDimVar) + + addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC) + addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1) + tilerModel.addConstraint(outputFirstDimVar == addDimVar_1) + tilerModel.addConstraint(outputSecondDimVar == addDimVar_2) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + from Deeploy.TilingExtension.TilerModel import PerformanceHint + + bufferA = ctxt.lookup(name=parseDict['A']) + bufferB = ctxt.lookup(name=parseDict['B']) + + tensorsShapeLen = min(len(bufferA.shape), len(bufferB.shape)) + + dimOffsetA = len(bufferA.shape) - 2 + dimOffsetB = len(bufferB.shape) - 2 + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName=bufferA.name, dimIdx=dimOffsetA + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName=bufferA.name, dimIdx=dimOffsetA + 1 - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName=bufferB.name, dimIdx=dimOffsetB + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName=bufferB.name, dimIdx=dimOffsetB + 1 - parseDict['transB']) + + # VIC: We don't want to deal with intermediate results between kernel calls + tilerModel.addConstraint(ASecondDimVar == parseDict['N']) + tilerModel.addConstraint(BFirstDimVar == parseDict['N']) + + tilerModel.addConstraint(AFirstDimVar == AFirstDimVar.Max(), strategy=PerformanceHint(1)) + tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy=PerformanceHint(1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['A', 'B', 'C', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + transA = operatorRepresentation['transA'] + transB = operatorRepresentation['transB'] + + varA = operatorRepresentation['A'] + varB = operatorRepresentation['B'] + + if transA == 0: + NSize = ctxt.lookup(varA).shape[-1] + else: + NSize = ctxt.lookup(varA).shape[-2] + + NOffset = 0 + + inputACubes = [] + inputBCubes = [] + inputAddCubes = [] + + replacements = {"M": [], "O": [], "batch": []} + + # Every output is constructed by a pair of inputs. Reconstruct this pair. 
+        for cube in outputCubes:
+
+            BSize = 1
+            BOffset = 0
+            BatchSize = 1
+            BatchOffset = 0
+
+            if len(cube.offset) == 2:
+                (MOffset, OOffset) = cube.offset
+                (MSize, OSize) = cube.dims
+            elif len(cube.offset) == 3:
+                (BatchOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, MSize, OSize) = cube.dims
+            else:
+                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, BSize, MSize, OSize) = cube.dims
+
+            replacements["M"].append(MSize)
+            replacements["O"].append(OSize)
+            replacements["batch"].append(BSize)
+
+            if transA == 0:
+                ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize))
+            else:
+                ACube = HyperRectangle((BatchOffset, BOffset, NOffset, MOffset), (BatchSize, BSize, NSize, MSize))
+
+            if transB == 0:
+                BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize))
+            else:
+                BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize))
+
+            CCube = HyperRectangle(cube.offset, cube.dims)
+
+            inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+            inputAddCubes.append(CCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(uint8_t)
+        }
+
+        for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
+            inputLoadSchedule.append({"A": a, "B": b, "C": c})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
new file mode 100644
index 0000000000..f25920f9d2
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
@@ -0,0 +1,198 @@
+# ----------------------------------------------------------------------
+#
+# File: MatmulTileConstraint.py
+#
+# Last edited: 28.04.2025
+#
+# Copyright (C) 2025, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
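# A worked sketch of the output-to-input cube reconstruction used above, and
# mirrored by the Matmul constraint below (illustrative numbers; follows the
# transA == transB == 0 branch): the reduction dimension N is never tiled, so
# every A/B tile keeps the full N extent.
def reconstruct_input_tiles(out_offset, out_dims, n_size):
    # (batch, B, M, O) output tile -> (batch, B, M, N) A tile, (batch, B, N, O) B tile
    batch_off, b_off, m_off, o_off = out_offset
    batch_sz, b_sz, m_sz, o_sz = out_dims
    a_cube = ((batch_off, b_off, m_off, 0), (batch_sz, b_sz, m_sz, n_size))
    b_cube = ((batch_off, b_off, 0, o_off), (batch_sz, b_sz, n_size, o_sz))
    return a_cube, b_cube

# A 16x12 output tile at offset (16, 24) of a 64x32 @ 32x48 matmul:
assert reconstruct_input_tiles((0, 0, 16, 24), (1, 1, 16, 12), 32) == \
    (((0, 0, 16, 0), (1, 1, 16, 32)), ((0, 0, 0, 24), (1, 1, 32, 12)))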
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import int8_t, uint16_t, uint32_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel, PerformanceHint +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleMatmulTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + # Add I/O dimensions to the model as variables + for _buffer in [bufferA, bufferB, outputBuffer]: + tilerModel.addTensorDimToModel(ctxt, _buffer.name) + + tensorsShapeLen = len(bufferA.shape) + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2)) + outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1)) + + # Map output dims to inputs dims + for idx in range(tensorsShapeLen - 2): + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( + tensorName = bufferA.name, dimIdx = idx)) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( + tensorName = bufferB.name, dimIdx = idx)) + + tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) + tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) + + # Add GEMM Geometrical constraints + tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + + tensorsShapeLen = len(bufferA.shape) + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + + # VIC: We don't want to deal with intermediate results between kernel calls + tilerModel.addConstraint(ASecondDimVar == parseDict['N']) + tilerModel.addConstraint(BFirstDimVar == 
parseDict['N'])
+
+        # Hardware-specific constraints for the 4x12 accelerator array
+        M_full_size = ctxt.lookup(bufferA.name).shape[(tensorsShapeLen - 2) + parseDict['transA']]
+        if M_full_size >= 16:
+            tilerModel.addTileSizeDivisibleConstraint(parseDict,
+                                                      "M",
+                                                      AFirstDimVar,
+                                                      16,
+                                                      strategy = PerformanceHint(priority = 1))
+        else:
+            tilerModel.addConstraint(AFirstDimVar == AFirstDimVar.Max(), strategy = PerformanceHint(1))
+
+        O_full_size = ctxt.lookup(bufferB.name).shape[(tensorsShapeLen - 1) - parseDict['transB']]
+        if O_full_size >= 12:
+            tilerModel.addTileSizeDivisibleConstraint(parseDict,
+                                                      "O",
+                                                      BSecondDimVar,
+                                                      12,
+                                                      strategy = PerformanceHint(priority = 1))
+        else:
+            tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1))
+
+        return tilerModel
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['A', 'B', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        varA = operatorRepresentation['A']
+
+        NSize = ctxt.lookup(varA).shape[-1]
+        NOffset = 0
+
+        inputACubes = []
+        inputBCubes = []
+
+        replacements = {"M": [], "O": [], "batch": []}
+
+        # Every output is constructed by a pair of inputs. Reconstruct this pair.
+        for cube in outputCubes:
+
+            BSize = 1
+            BOffset = 0
+            BatchSize = 1
+            BatchOffset = 0
+
+            if len(cube.offset) == 2:
+                (MOffset, OOffset) = cube.offset
+                (MSize, OSize) = cube.dims
+            elif len(cube.offset) == 3:
+                (BatchOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, MSize, OSize) = cube.dims
+            else:
+                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, BSize, MSize, OSize) = cube.dims
+
+            replacements["M"].append(MSize)
+            replacements["O"].append(OSize)
+            replacements["batch"].append(BSize)
+
+            ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize))
+            BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize))
+
+            inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(int8_t)
+        }
+
+        for a, b in zip(inputACubes, inputBCubes):
+            inputLoadSchedule.append({"A": a, "B": b})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/__init__.py b/Deeploy/Targets/Redmule/TileConstraints/__init__.py
new file mode 100644
index 0000000000..a73187ca8f
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------
+#
+# File: __init__.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Redmule/Tiler.py b/Deeploy/Targets/Redmule/Tiler.py new file mode 100644 index 0000000000..d131b42d4e --- /dev/null +++ b/Deeploy/Targets/Redmule/Tiler.py @@ -0,0 +1,37 @@ +# ---------------------------------------------------------------------- +# +# File: Tiler.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Deeploy.Targets.Redmule.Bindings import RedmuleMatmulBindings, RedmuleConv2DBindings, RedmuleGEMMBindings +from Deeploy.Targets.Redmule.TileConstraints.MatmulTileConstraint import RedmuleMatmulTileConstraint +from Deeploy.Targets.Redmule.TileConstraints.ConvTileConstraint import RedmuleConv2DTileConstraint +from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings +from Deeploy.Targets.Redmule.TileConstraints.GEMMTileConstraint import RedmuleGEMMTileConstraint + +RedmuleMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleMatmulBindings, + tileConstraint = RedmuleMatmulTileConstraint()) +RedmuleConvTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleConv2DBindings, + tileConstraint = RedmuleConv2DTileConstraint()) +RedmuleGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleGEMMBindings, + tileConstraint = RedmuleGEMMTileConstraint()) \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py new file mode 100644 index 0000000000..31c4e17d05 --- /dev/null +++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py @@ -0,0 +1,176 @@ +# ---------------------------------------------------------------------- +# +# File: RedMulePasses.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2025, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +import numpy as np +import numpy.typing as npt +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match, NonBranchingMatcher +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import ( + _permuteLastTwoDims, + _appendTransposeNode, +) + + + +def _redmule_weight_layout_fun(graph: gs.Graph, match: Match, name: str): + """Convert Conv weights from [cout, h, w, cin] to [h,w,cin, cout] for RedMule accelerator""" + node = list(match.nodes_map.values())[0] + + weightTensor = node.inputs[1] + if isinstance(weightTensor, gs.Constant): + weightTensor.values = np.transpose(weightTensor.values, (1, 2, 3, 0)) + + return graph + + +@contextagnostic +class RedMuleAdjustWeightMemoryLayoutPass(ReplaceSequentialPatternPass): + """Pass to convert Conv weights from [cout, h, w, cin] to [hwcin, cout] for RedMule accelerator""" + + def __init__(self, redmuleEngineName: str): + graph = gs.Graph() + _input = gs.Variable(name='input_1') + output = graph.layer(inputs=[_input], outputs=['convOut'], op='Conv', name='conv') + graph.outputs.append(output) + graph.inputs.append(_input) + + super().__init__( + graph, + _redmule_weight_layout_fun, + "_REDMULE_ADJUST_WEIGHT_MEMORY_LAYOUT_PASS") + + +def _redmule_gemm_transpose_fun(graph: gs.Graph, match: Match, name: str): + """ + Handle GEMM transA and transB attributes for RedMule accelerator + + Properly handles tensors of any dimensionality, ensuring only the last two + dimensions are transposed when needed. 
+ """ + matched_nodes = [m for k, m in match.nodes_map.items()] + gemm_node = matched_nodes[0] + + if 'transA' not in gemm_node.attrs: + gemm_node.attrs['transA'] = 0 + if 'transB' not in gemm_node.attrs: + gemm_node.attrs['transB'] = 0 + if 'alpha' not in gemm_node.attrs: + gemm_node.attrs['alpha'] = 1.0 + if 'beta' not in gemm_node.attrs: + gemm_node.attrs['beta'] = 1.0 + + inputA = gemm_node.inputs[0] + inputB = gemm_node.inputs[1] + + + if gemm_node.attrs['transA'] != 0: + if isinstance(inputA, gs.Constant): + print(f"Physical transpose for constant A: {inputA.name}") + + if len(inputA.values.shape) > 2: + perm = list(range(len(inputA.values.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + inputA.values = np.transpose(inputA.values, perm) + else: + inputA.values = np.transpose(inputA.values) + + gemm_node.attrs['transA'] = 0 + else: + + perm = list(range(len(inputA.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + + anchorTransposeNode, anchorTransposeOutput = _appendTransposeNode( + inputA, + name + "_A_Transpose", + perm + ) + gemm_node.inputs[0] = anchorTransposeOutput + gemm_node.attrs['transA'] = 0 + graph.nodes.append(anchorTransposeNode) + + + if gemm_node.attrs['transB'] != 0: + if isinstance(inputB, gs.Constant): + + if len(inputB.values.shape) > 2: + + perm = list(range(len(inputB.values.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + inputB.values = np.transpose(inputB.values, perm) + else: + inputB.values = np.transpose(inputB.values) + + gemm_node.attrs['transB'] = 0 + else: + print(f"Adding transpose node for variable B: {inputB.name}") + + perm = list(range(len(inputB.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + anchorTransposeNode, anchorTransposeOutput = _appendTransposeNode( + inputB, + name + "_B_Transpose", + perm + ) + gemm_node.inputs[1] = anchorTransposeOutput + gemm_node.attrs['transB'] = 0 + graph.nodes.append(anchorTransposeNode) + + return graph + + +@contextagnostic +class RedMuleGEMMTransposePass(ReplaceSequentialPatternPass): + """Pass to handle GEMM transA and transB attributes for RedMule accelerator""" + + def __init__(self, redmuleEngineName: str): + + pattern = gs.Graph() + + input_a = gs.Variable(name="input_a") + input_b = gs.Variable(name="input_b") + + gemm_output = pattern.layer( + op="Gemm", + name="gemm_node", + inputs=[input_a, input_b], + outputs=["gemm_output"] + ) + + + pattern.inputs = [input_a, input_b] + pattern.outputs = [gemm_output] + + super().__init__( + pattern=pattern, + replacement_fn=_redmule_gemm_transpose_fun, + name="_REDMULE_GEMM_TRANSPOSE_PASS" + ) \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py new file mode 100644 index 0000000000..63063b6066 --- /dev/null +++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index 888e42ae5a..2bc2a29d57 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -37,7 +37,7 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) -Wno-pointer-sign ) - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) set(USE_NEUREKA ON) add_subdirectory(Platforms/Siracusa) elseif(platform STREQUAL PULPOpen) diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytest.c b/DeeployTest/Platforms/Siracusa/src/deeploytest.c index 5f04d78df3..643fb18928 100644 --- a/DeeployTest/Platforms/Siracusa/src/deeploytest.c +++ b/DeeployTest/Platforms/Siracusa/src/deeploytest.c @@ -68,7 +68,7 @@ float diff = expected_val - actual_val; - if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) + if ((diff < -2e-4) || (diff > 2e-4) || isnan(diff)) { local_err_count += 1; diff --git a/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/inputs.npz b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/inputs.npz new file mode 100644 index 0000000000..d47edbbed8 Binary files /dev/null and b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/inputs.npz differ diff --git a/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/network.onnx b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/network.onnx new file mode 100644 index 0000000000..70010413cc Binary files /dev/null and b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/network.onnx differ diff --git a/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/outputs.npz b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/outputs.npz new file mode 100644 index 0000000000..d756cdb275 Binary files /dev/null and b/DeeployTest/Tests/CCT/CCT_infer_32_128_1_2/outputs.npz differ diff --git a/DeeployTest/Tests/testFloat2dConvLarge/inputs.npz b/DeeployTest/Tests/testFloat2dConvLarge/inputs.npz new file mode 100644 index 0000000000..5d15b68696 Binary files /dev/null and b/DeeployTest/Tests/testFloat2dConvLarge/inputs.npz differ diff --git a/DeeployTest/Tests/testFloat2dConvLarge/network.onnx b/DeeployTest/Tests/testFloat2dConvLarge/network.onnx new file mode 100644 index 0000000000..dd710f7ec0 Binary files /dev/null and b/DeeployTest/Tests/testFloat2dConvLarge/network.onnx differ diff --git a/DeeployTest/Tests/testFloat2dConvLarge/outputs.npz b/DeeployTest/Tests/testFloat2dConvLarge/outputs.npz new file mode 100644 index 0000000000..f1cf1fa8e9 Binary files /dev/null and b/DeeployTest/Tests/testFloat2dConvLarge/outputs.npz differ diff --git a/DeeployTest/Tests/testFloatMatmulLarge/inputs.npz b/DeeployTest/Tests/testFloatMatmulLarge/inputs.npz new file mode 100644 index 0000000000..06fe42968b Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge/inputs.npz differ diff --git a/DeeployTest/Tests/testFloatMatmulLarge/network.onnx b/DeeployTest/Tests/testFloatMatmulLarge/network.onnx new file mode 100644 index 0000000000..d91cbeeacc Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge/network.onnx differ diff --git a/DeeployTest/Tests/testFloatMatmulLarge/outputs.npz 
b/DeeployTest/Tests/testFloatMatmulLarge/outputs.npz
new file mode 100644
index 0000000000..edd6182cd9
Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge/outputs.npz differ
diff --git a/DeeployTest/Tests/testFloatMatmulLarge256/inputs.npz b/DeeployTest/Tests/testFloatMatmulLarge256/inputs.npz
new file mode 100644
index 0000000000..43eb2325f9
Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge256/inputs.npz differ
diff --git a/DeeployTest/Tests/testFloatMatmulLarge256/network.onnx b/DeeployTest/Tests/testFloatMatmulLarge256/network.onnx
new file mode 100644
index 0000000000..cd3d3c1474
Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge256/network.onnx differ
diff --git a/DeeployTest/Tests/testFloatMatmulLarge256/outputs.npz b/DeeployTest/Tests/testFloatMatmulLarge256/outputs.npz
new file mode 100644
index 0000000000..fe0be0bad6
Binary files /dev/null and b/DeeployTest/Tests/testFloatMatmulLarge256/outputs.npz differ
diff --git a/DeeployTest/testRunner_tiled_siracusa_w_redmule.py b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py
new file mode 100644
index 0000000000..015ca62085
--- /dev/null
+++ b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py
@@ -0,0 +1,50 @@
+# ----------------------------------------------------------------------
+#
+# File: testRunner_tiled_siracusa_w_redmule.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
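# The deeploytest.c change above widens the float comparison bound; a host-side
# equivalent of the accept condition (a minimal sketch, not part of the test
# harness):
import math

def output_matches(expected: float, actual: float, tol: float = 2e-4) -> bool:
    diff = expected - actual
    return not (diff < -tol or diff > tol or math.isnan(diff))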
+ +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = True, + description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & Redmule).") + + parser.add_argument('--cores', + metavar = '', + dest = 'cores', + type = int, + default = 1, + help = 'Set number of cluster cores') + args = parser.parse_args() + + testRunner = TestRunner(platform = "Siracusa_w_redmule", + simulator = "gvsoc", + tiling = True, + argument_parser = parser) + + testRunner.cmake_args += f" -D NUM_CORES={args.cores}" + + + testRunner.run() diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 4e24995d78..4b7f845ce1 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -39,13 +39,15 @@ from Deeploy.Targets.Neureka.Deployer import NeurekaDeployer from Deeploy.Targets.Neureka.Platform import MemoryNeurekaPlatform, MemoryNeurekaPlatformWrapper, NeurekaOptimizer, \ NeurekaPlatform +from Deeploy.Targets.Redmule.Deployer import RedmuleDeployer +from Deeploy.Targets.Redmule.Platform import RedmuleOptimizer, RedmulePlatform from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform from Deeploy.Targets.Snitch.Deployer import SnitchDeployer from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch"] +_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Siracusa_w_redmule"] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -77,6 +79,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Siracusa_w_neureka": Platform = NeurekaPlatform() + + elif platformName == "Siracusa_w_redmule": + Platform = RedmulePlatform() elif platformName == "Snitch": Platform = SnitchPlatform() @@ -89,7 +94,7 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: def setupMemoryPlatform(platform: DeploymentPlatform, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel) -> Union[MemoryPlatform, MemoryPlatformWrapper]: - if isinstance(platform, PULPPlatform): + if isinstance(platform, (PULPPlatform, RedmulePlatform)): return MemoryPULPPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel) elif isinstance(platform, NeurekaPlatform): weightMemoryLevel = memoryHierarchy.memoryLevels["WeightMemory_SRAM"] \ @@ -191,6 +196,22 @@ def mapDeployer(platform: DeploymentPlatform, name = name, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + + elif isinstance(platform, (RedmulePlatform)): + if loweringOptimizer is None: + loweringOptimizer = RedmuleOptimizer + + if default_channels_first is None: + default_channels_first = False + + deployer = RedmuleDeployer(graph, + platform, + inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir) elif isinstance(platform, (PULPPlatform, MemoryPULPPlatform, MemoryPULPPlatformWrapper)): diff --git a/Makefile b/Makefile index 806daa274f..35e0d7febb 100644 --- a/Makefile +++ b/Makefile @@ -44,6 +44,7 @@ PICOLIBC_RV32IM_INSTALL_DIR ?= 
${LLVM_INSTALL_DIR}/picolibc/riscv/rv32im PICOLIBC_RV32IMC_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imc PICOLIBC_RV32IMA_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32ima PICOLIBC_RV32IMAFD_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imafd +PICOLIBC_RV32IMF_INSTALL_DIR ?= ${LLVM_INSTALL_DIR}/picolibc/riscv/rv32imf PULP_SDK_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/pulp-sdk QEMU_INSTALL_DIR ?= ${DEEPLOY_INSTALL_DIR}/qemu @@ -64,7 +65,7 @@ PULP_SDK_COMMIT_HASH ?= 3e1e569bd789a11d9dde6d6b3930849505e68b4a BANSHEE_COMMIT_HASH ?= 0e105921e77796e83d01c2aa4f4cadfa2005b4d9 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2 -GVSOC_COMMIT_HASH ?= eeb7ef8c1dfcb944ac80d797a8cea35aacc14ac5 +GVSOC_COMMIT_HASH ?= 35d00d15d7249daaac0de61bd8485fba128e5959 MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d XTL_VERSION ?= 0.7.5 XSIMD_VERSION ?= 13.2.0 @@ -352,7 +353,18 @@ ${PICOLIBC_RV32IMAFD_INSTALL_DIR}: ${TOOLCHAIN_DIR}/picolibc --cross-file ../scripts/meson-build-script-rv32imafd.txt && \ PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson install -picolibc-riscv: ${PICOLIBC_RV32IM_INSTALL_DIR} ${PICOLIBC_RV32IMA_INSTALL_DIR} ${PICOLIBC_RV32IMC_INSTALL_DIR} ${PICOLIBC_RV32IMAFD_INSTALL_DIR} +${PICOLIBC_RV32IMF_INSTALL_DIR}: ${TOOLCHAIN_DIR}/picolibc + cd ${TOOLCHAIN_DIR}/picolibc && mkdir -p build-rv32imf && cd build-rv32imf && \ + cp ${TOOLCHAIN_DIR}/meson-build-script-rv32imf.txt ../scripts && \ + PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson setup --reconfigure -Dincludedir=include \ + -Dlibdir=lib \ + -Dspecsdir=none \ + -Dmultilib=false \ + --prefix ${PICOLIBC_RV32IMF_INSTALL_DIR} \ + --cross-file ../scripts/meson-build-script-rv32imf.txt && \ + PATH=${LLVM_INSTALL_DIR}/bin:${PATH} meson install + +picolibc-riscv: ${PICOLIBC_RV32IM_INSTALL_DIR} ${PICOLIBC_RV32IMA_INSTALL_DIR} ${PICOLIBC_RV32IMC_INSTALL_DIR} ${PICOLIBC_RV32IMAFD_INSTALL_DIR} ${PICOLIBC_RV32IMF_INSTALL_DIR} ${TOOLCHAIN_DIR}/pulp-sdk: cd ${TOOLCHAIN_DIR} && \ @@ -390,7 +402,7 @@ snitch_runtime: ${SNITCH_INSTALL_DIR} ${TOOLCHAIN_DIR}/gvsoc: cd ${TOOLCHAIN_DIR} && \ - git clone https://github.com/gvsoc/gvsoc.git && \ + git clone https://github.com/runwangdl/gvsoc.git && \ cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \ git submodule update --init --recursive && \ pip install -r core/requirements.txt && pip install -r gapy/requirements.txt diff --git a/TargetLibraries/Generic/inc/kernel/GELU.h b/TargetLibraries/Generic/inc/kernel/GELU.h index 0c6d19d6c7..0825a11e0e 100644 --- a/TargetLibraries/Generic/inc/kernel/GELU.h +++ b/TargetLibraries/Generic/inc/kernel/GELU.h @@ -46,4 +46,9 @@ void GELU_s8_s32(int8_t *data_in, int32_t *data_out, int32_t dataSize, int8_t b, void GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize); +void GELU_fp32_fp32_sigmoid(float32_t *data_in, float32_t *data_out, int32_t dataSize); + +void GELU_fp32_fp32_sigmoid_chunk(float32_t *data_in, float32_t *data_out, + int32_t start_idx, int32_t end_idx); + #endif //__DEEPLOY_BASIC_MATH_GELU_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/inc/kernel/MatMul.h b/TargetLibraries/Generic/inc/kernel/MatMul.h index d9b35eb1a7..d646950683 100644 --- a/TargetLibraries/Generic/inc/kernel/MatMul.h +++ b/TargetLibraries/Generic/inc/kernel/MatMul.h @@ -62,10 +62,20 @@ void MatMul_s8_s8_s32(int8_t const *__restrict__ pSrcA, uint32_t P, int32_t A_offset, int32_t B_offset, int32_t C_offset); 
+/******************************************************************************/
+/* Matrix Multiplication (Float32)                                            */
+/******************************************************************************/

 void MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
-                           const float32_t *__restrict__ pSrcB,
-                           float32_t *__restrict__ pDstY,
-                           uint32_t M,
-                           uint32_t N,
-                           uint32_t O);
+                           const float32_t *__restrict__ pSrcB,
+                           float32_t *__restrict__ pDstY,
+                           uint32_t M,
+                           uint32_t N,
+                           uint32_t O);
+
+void MatMul_fp32_fp32_fp32_unroll1x7(const float32_t *__restrict__ pSrcA,
+                                     const float32_t *__restrict__ pSrcB,
+                                     float32_t *__restrict__ pDstY,
+                                     uint32_t M,
+                                     uint32_t N,
+                                     uint32_t O);

 #endif //__DEEPLOY_BASIC_MATH_MATMUL_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/src/GELU_fp32.c b/TargetLibraries/Generic/src/GELU_fp32.c
index 923dcf9c65..18e5e0a41a 100644
--- a/TargetLibraries/Generic/src/GELU_fp32.c
+++ b/TargetLibraries/Generic/src/GELU_fp32.c
@@ -36,3 +36,28 @@ void GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize) {
     data_out[i] = x * cdf;
   }
 }
+
+void GELU_fp32_fp32_sigmoid(float32_t *data_in, float32_t *data_out, int32_t dataSize) {
+
+  const float32_t scale = 1.702f;
+  for (int i = 0; i < dataSize; i++) {
+    float32_t x = data_in[i];
+    float32_t sigmoid_in = scale * x;
+    // sigmoid(z) = 1 / (1 + exp(-z))
+    float32_t sigmoid = 1.0f / (1.0f + expf(-sigmoid_in));
+    data_out[i] = x * sigmoid;
+  }
+}
+
+void GELU_fp32_fp32_sigmoid_chunk(float32_t *data_in, float32_t *data_out, int32_t start_idx, int32_t end_idx)
+{
+  const float32_t scale = 1.702f;
+  for (int32_t i = start_idx; i < end_idx; i++)
+  {
+    float32_t x = data_in[i];
+    float32_t sigmoid_in = scale * x;
+    // sigmoid(z) = 1 / (1 + exp(-z))
+    float32_t sigmoid = 1.0f / (1.0f + expf(-sigmoid_in));
+    data_out[i] = x * sigmoid;
+  }
+}
\ No newline at end of file
diff --git a/TargetLibraries/Generic/src/MatMul_fp32.c b/TargetLibraries/Generic/src/MatMul_fp32.c
index 1d704b8517..3ef26a6054 100644
--- a/TargetLibraries/Generic/src/MatMul_fp32.c
+++ b/TargetLibraries/Generic/src/MatMul_fp32.c
@@ -29,20 +29,93 @@
 #include "DeeployBasicMath.h"

 void MatMul_fp32_fp32_fp32(const float32_t *__restrict__ pSrcA,
-                           const float32_t *__restrict__ pSrcB,
-                           float32_t *__restrict__ pDstY,
-                           uint32_t M,
-                           uint32_t N,
-                           uint32_t O) {
-
-
-  for (uint32_t i = 0; i < M; ++i) {
-    for (uint32_t j = 0; j < O; ++j) {
+                           const float32_t *__restrict__ pSrcB,
+                           float32_t *__restrict__ pDstY,
+                           uint32_t M,
+                           uint32_t N,
+                           uint32_t O)
+{
+
+  for (uint32_t i = 0; i < M; ++i)
+  {
+    for (uint32_t j = 0; j < O; ++j)
+    {
       float32_t sum = 0.0f;
-      for (uint32_t k = 0; k < N; ++k) {
+      for (uint32_t k = 0; k < N; ++k)
+      {
         sum += pSrcA[i * N + k] * pSrcB[k * O + j];
       }
       pDstY[i * O + j] = sum;
     }
   }
-}
\ No newline at end of file
+}
+
+void MatMul_fp32_fp32_fp32_unroll1x7(const float32_t *__restrict__ pSrcA,
+                                     const float32_t *__restrict__ pSrcB,
+                                     float32_t *__restrict__ pDstY,
+                                     uint32_t M,
+                                     uint32_t N,
+                                     uint32_t O)
+{
+  uint32_t i, j, k;
+  uint32_t O_block = O - (O % 7);
+
+  for (i = 0; i < M; i++)
+  {
+    // Main loop: compute seven output columns per iteration so the single
+    // load of pSrcA[i * N + k] is reused across seven accumulators.
+    for (j = 0; j < O_block; j += 7)
+    {
+      float32_t sum0 = 0.0f;
+      float32_t sum1 = 0.0f;
+      float32_t sum2 = 0.0f;
+      float32_t sum3 = 0.0f;
+      float32_t sum4 = 0.0f;
+      float32_t sum5 = 0.0f;
+      float32_t sum6 = 0.0f;
+
+      for (k = 0; k < N; k++)
+      {
+        float32_t a0 = pSrcA[i * N + k];
+
+        float32_t b0 = pSrcB[k * O + (j + 0)];
+        float32_t b1 = pSrcB[k * O + (j + 1)];
+        float32_t b2 = pSrcB[k * O + (j + 2)];
+        float32_t b3 = pSrcB[k * O + (j + 3)];
+        float32_t b4 = pSrcB[k * O + (j + 4)];
+        float32_t b5 = pSrcB[k * O + (j + 5)];
+        float32_t b6 = pSrcB[k * O + (j + 6)];
+
+        sum0 += a0 * b0;
+        sum1 += a0 * b1;
+        sum2 += a0 * b2;
+        sum3 += a0 * b3;
+        sum4 += a0 * b4;
+        sum5 += a0 * b5;
+        sum6 += a0 * b6;
+      }
+
+      pDstY[i * O + (j + 0)] = sum0;
+      pDstY[i * O + (j + 1)] = sum1;
+      pDstY[i * O + (j + 2)] = sum2;
+      pDstY[i * O + (j + 3)] = sum3;
+      pDstY[i * O + (j + 4)] = sum4;
+      pDstY[i * O + (j + 5)] = sum5;
+      pDstY[i * O + (j + 6)] = sum6;
+    }
+
+    // Remainder loop for the last O % 7 columns.
+    for (j = O_block; j < O; j++)
+    {
+      float32_t sum = 0.0f;
+
+      for (k = 0; k < N; k++)
+      {
+        float32_t a_val = pSrcA[i * N + k];
+        float32_t b_val = pSrcB[k * O + j];
+        float32_t prod = a_val * b_val;
+        sum += prod;
+      }
+
+      pDstY[i * O + j] = sum;
+    }
+  }
+}
+
diff --git a/TargetLibraries/Generic/src/Softmax_fp32.c b/TargetLibraries/Generic/src/Softmax_fp32.c
index 5553f1e302..e9082c3333 100644
--- a/TargetLibraries/Generic/src/Softmax_fp32.c
+++ b/TargetLibraries/Generic/src/Softmax_fp32.c
@@ -41,12 +41,14 @@ void Softmax_fp32_fp32(float32_t* input, float32_t* output, int32_t size, int32_
     }

     for (int i = 0; i < last_dim_length; i++) {
-      output[b * last_dim_length + i] = expf(input[b * last_dim_length + i] - max_val);
+      float32_t exp_val = input[b * last_dim_length + i] - max_val;
+      output[b * last_dim_length + i] = expf(exp_val);
       sum += output[b * last_dim_length + i];
     }

+    float32_t sum_1 = 1.0f / sum;
     for (int i = 0; i < last_dim_length; i++) {
-      output[b * last_dim_length + i] /= sum;
+      output[b * last_dim_length + i] = output[b * last_dim_length + i] * sum_1;
     }
   }
 }
diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt
index d951c6bde4..7508962fb1 100644
--- a/TargetLibraries/PULPOpen/CMakeLists.txt
+++ b/TargetLibraries/PULPOpen/CMakeLists.txt
@@ -2,7 +2,7 @@ file(GLOB_RECURSE SOURCES
   "src/**"
 )

-if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka")
+if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka" OR platform STREQUAL "Siracusa_w_redmule")
   include(cmake/pulp-sdk-siracusa.cmake)
 elseif(platform STREQUAL "PULPOpen")
   include(cmake/pulp-sdk-pulp-open.cmake)
diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h
new file mode 100644
index 0000000000..51c5097744
--- /dev/null
+++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h
@@ -0,0 +1,75 @@
+
+/* =====================================================================
+ * Title: Conv.h
+ * Description:
+ *
+ * $Date: 05.04.2025
+ *
+ * ===================================================================== */
+/*
+ * Copyright (C) 2020 ETH Zurich and University of Bologna.
+ *
+ * Author: Run Wang, ETH Zurich
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ #include "DeeployPULPMath.h"
+
+ void Conv2d_ChannelRange_fp32_fp32_fp32_HWC(
+     const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C,
+     const float32_t *__restrict__ pSrcB, uint32_t F_subset,
+     uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ,
+     float32_t *__restrict__ pDstC, uint32_t F_total, uint32_t F_start,
+     uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right);
+
+
+ void Conv2d_Im2Col_ChannelRange_fp32_fp32_fp32_HWC(
+     const float32_t *__restrict__ pSrcA,
+     uint32_t H,
+     uint32_t W,
+     uint32_t C,
+     const float32_t *__restrict__ pSrcB,
+     uint32_t F_subset,
+     uint32_t P,
+     uint32_t Q,
+     uint32_t SP,
+     uint32_t SQ,
+     float32_t *__restrict__ pDstC,
+     uint32_t F_total,
+     uint32_t F_start,
+     uint32_t pad_top,
+     uint32_t pad_bottom,
+     uint32_t pad_left,
+     uint32_t pad_right,
+     float32_t *__restrict__ pIm2ColBuffer);
+
+ void Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule(
+     const float32_t *__restrict__ pSrcA,
+     uint32_t H,
+     uint32_t W,
+     uint32_t C,
+     const float32_t *__restrict__ pSrcB,
+     uint32_t P,
+     uint32_t Q,
+     uint32_t SP,
+     uint32_t SQ,
+     float32_t *__restrict__ pDstC,
+     uint32_t F,
+     uint32_t pad_top,
+     uint32_t pad_bottom,
+     uint32_t pad_left,
+     uint32_t pad_right,
+     float32_t *__restrict__ pIm2ColBuffer);
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/inc/kernel/gelu.h b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h
similarity index 54%
rename from TargetLibraries/PULPOpen/inc/kernel/gelu.h
rename to TargetLibraries/PULPOpen/inc/kernel/MaxPool.h
index 390e7f9926..6d9dd6ea86 100644
--- a/TargetLibraries/PULPOpen/inc/kernel/gelu.h
+++ b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h
@@ -1,15 +1,14 @@
-
 /* =====================================================================
- * Title: gelu.h
+ * Title: MaxPool.h
  * Description:
  *
- * $Date: 28.01.2025
+ * $Date: 05.04.2025
  *
  * ===================================================================== */
 /*
  * Copyright (C) 2020 ETH Zurich and University of Bologna.
  *
- * Author: Moritz Scherer, ETH Zurich
+ * Author: Run Wang, ETH Zurich
  *
  * SPDX-License-Identifier: Apache-2.0
  *
@@ -28,4 +27,18 @@

 #include "DeeployPULPMath.h"

-void GELU_fp32_fp32(float32_t *data_in, float32_t *data_out, int32_t dataSize);
\ No newline at end of file
+
+void MaxPool2d_fp32_fp32_HWC(
+    const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C,
+    uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ,
+    float32_t *__restrict__ pDstC,
+    uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right);
+
+
+void MaxPool2d_ChannelRange_fp32_fp32_HWC(
+    const float32_t *__restrict__ pSrcA,
+    uint32_t W, uint32_t H, uint32_t C,
+    uint32_t Q, uint32_t P, uint32_t SQ, uint32_t SP,
+    float32_t *__restrict__ pDstC,
+    uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right,
+    uint32_t ch_start, uint32_t ch_count);
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/inc/kernel/gemv.h b/TargetLibraries/PULPOpen/inc/kernel/gemv.h
index 214f8300ad..057e4a7a4d 100644
--- a/TargetLibraries/PULPOpen/inc/kernel/gemv.h
+++ b/TargetLibraries/PULPOpen/inc/kernel/gemv.h
@@ -26,9 +26,19 @@
  */

 #include "stdint.h"
+#include "DeeployPULPMath.h"

 void gemv_s8_s8_plp(int8_t *pIn, int8_t *pBias, int8_t *pOut, int8_t *pWeight,
                     int32_t *pKappa, int32_t *pLambda, uint16_t out_mult,
                     uint16_t out_shift, uint16_t dim_vec, uint16_t num_o_neurons,
                     uint8_t flag_relu, uint8_t flag_batch_norm);
+
+void Gemm_fp32_fp32_fp32_fp32_Redmule(
+    const float32_t *__restrict__ pSrcA,
+    const float32_t *__restrict__ pSrcB,
+    const float32_t *__restrict__ pBias,
+    float32_t *__restrict__ pDstY,
+    uint32_t M,
+    uint32_t N,
+    uint32_t O);
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/Convolution_fp32.c b/TargetLibraries/PULPOpen/src/Convolution_fp32.c
index ab804b43a9..88a8706f49 100644
--- a/TargetLibraries/PULPOpen/src/Convolution_fp32.c
+++ b/TargetLibraries/PULPOpen/src/Convolution_fp32.c
@@ -30,11 +30,11 @@
 #include "DeeployPULPMath.h"
 #include "pmsis.h"

-void Conv2d_fp32_fp32_fp32_HWC(
+void Conv2d_ChannelRange_fp32_fp32_fp32_HWC(
     const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C,
-    const float32_t *__restrict__ pSrcB, uint32_t F,
+    const float32_t *__restrict__ pSrcB, uint32_t F_subset,
     uint32_t P, uint32_t Q, uint32_t SP, uint32_t SQ,
-    float32_t *__restrict__ pDstC,
+    float32_t *__restrict__ pDstC, uint32_t F_total, uint32_t F_start,
     uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right) {

@@ -48,7 +48,7 @@ void Conv2d_fp32_fp32_fp32_HWC(

   for (h = 0; h < H_out; ++h) {
     for (w = 0; w < W_out; ++w) {
-      for (f = 0; f < F; ++f) {
+      for (f = 0; f < F_subset; ++f) {
         float32_t sum = 0.0f;

         for (p = 0; p < P; ++p) {
@@ -73,9 +73,182 @@ void Conv2d_fp32_fp32_fp32_HWC(
           }
         }

-        uint32_t output_idx = (h * W_out + w) * F + f;
+        uint32_t output_idx = (h * W_out + w) * F_total + (F_start + f);
         pDstC[output_idx] = sum;
       }
     }
   }
+}
+
+void Conv2d_Im2Col_ChannelRange_fp32_fp32_fp32_HWC(
+    const float32_t *__restrict__ pSrcA,
+    uint32_t H,
+    uint32_t W,
+    uint32_t C,
+    const float32_t *__restrict__ pSrcB,
+    uint32_t F_subset,
+    uint32_t P,
+    uint32_t Q,
+    uint32_t SP,
+    uint32_t SQ,
+    float32_t *__restrict__ pDstC,
+    uint32_t F_total,
+    uint32_t F_start,
+    uint32_t pad_top,
+    uint32_t pad_bottom,
+    uint32_t pad_left,
+    uint32_t pad_right,
+    float32_t *__restrict__ pIm2ColBuffer)
+{
+
+  uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1;
+  uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1;
+
+  uint32_t kernel_size = P * Q * C;
+
+  for (uint32_t h_out = 0; h_out < H_out; h_out++)
+  {
+    for (uint32_t w_out = 0; w_out < W_out; w_out++)
+    {
+
+      int32_t h_in_start = h_out * SP - pad_top;
+      int32_t w_in_start = w_out * SQ - pad_left;
+
+      float32_t *pIm2Col = pIm2ColBuffer;
+
+      // Gather the receptive field of this output position into the im2col
+      // buffer, zero-filling out-of-bounds (padded) taps.
+      for (uint32_t p = 0; p < P; p++)
+      {
+        int32_t h_in = h_in_start + p;
+
+        for (uint32_t q = 0; q < Q; q++)
+        {
+          int32_t w_in = w_in_start + q;
+
+          for (uint32_t c = 0; c < C; c++)
+          {
+            if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W)
+            {
+              uint32_t in_idx = (h_in * W + w_in) * C + c;
+              pIm2Col[p * Q * C + q * C + c] = pSrcA[in_idx];
+            }
+            else
+            {
+              pIm2Col[p * Q * C + q * C + c] = 0.0f;
+            }
+          }
+        }
+      }
+
+      // One dot product per output channel in this core's channel range.
+      for (uint32_t f = 0; f < F_subset; f++)
+      {
+        float32_t sum = 0.0f;
+
+        const float32_t *weight_ptr = pSrcB + f * kernel_size;
+
+        for (uint32_t k = 0; k < kernel_size; k++)
+        {
+          sum += pIm2Col[k] * weight_ptr[k];
+        }
+
+        uint32_t out_idx = (h_out * W_out + w_out) * F_total + (F_start + f);
+        pDstC[out_idx] = sum;
+      }
+    }
+  }
+}
+
+void Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule(
+    const float32_t *__restrict__ pSrcA,
+    uint32_t H,
+    uint32_t W,
+    uint32_t C,
+    const float32_t *__restrict__ pSrcB,
+    uint32_t P,
+    uint32_t Q,
+    uint32_t SP,
+    uint32_t SQ,
+    float32_t *__restrict__ pDstC,
+    uint32_t F,
+    uint32_t pad_top,
+    uint32_t pad_bottom,
+    uint32_t pad_left,
+    uint32_t pad_right,
+    float32_t *__restrict__ pIm2ColBuffer) {
+
+  uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1;
+  uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1;
+  uint32_t kernel_size = P * Q * C;
+  uint32_t core_id = pi_core_id();
+  uint32_t num_cores = NUM_CORES;
+
+  uint32_t total_positions = H_out * W_out;
+  uint32_t num_batches = (total_positions + num_cores - 1) / num_cores;
+
+  float32_t *core_im2col_buffer = pIm2ColBuffer + core_id * kernel_size;
+
+  for (uint32_t batch = 0; batch < num_batches; batch++) {
+
+    uint32_t batch_start_pos = batch * num_cores;
+
+    uint32_t valid_cores = MIN(num_cores, total_positions - batch_start_pos);
+
+    // Each core gathers the im2col patch for one output position of this
+    // batch into its private slice of the shared buffer.
+    if (core_id < valid_cores) {
+
+      uint32_t pos = batch_start_pos + core_id;
+
+      uint32_t h_out = pos / W_out;
+      uint32_t w_out = pos % W_out;
+      int32_t h_in_start = h_out * SP - pad_top;
+      int32_t w_in_start = w_out * SQ - pad_left;
+
+      for (uint32_t p = 0; p < P; p++) {
+        int32_t h_in = h_in_start + p;
+
+        for (uint32_t q = 0; q < Q; q++) {
+          int32_t w_in = w_in_start + q;
+          uint32_t in_offset = (h_in * W + w_in) * C;
+          uint32_t kernel_offset = (p * Q + q) * C;
+
+          if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W) {
+
+            for (uint32_t c = 0; c < C; c++) {
+              core_im2col_buffer[kernel_offset + c] = pSrcA[in_offset + c];
+            }
+          }
+          else {
+
+            for (uint32_t c = 0; c < C; c++) {
+              core_im2col_buffer[kernel_offset + c] = 0.0f;
+            }
+          }
+        }
+      }
+    }
+
+    pi_cl_team_barrier();
+
+    // Core 0 offloads the gathered batch as a single [valid_cores x
+    // kernel_size] x [kernel_size x F] matmul to RedMule.
+    if (core_id == 0) {
+
+      float32_t *batch_output = pDstC + batch_start_pos * F;
+
+      MatMul_fp32_fp32_fp32_Redmule(
+          pIm2ColBuffer,
+          pSrcB,
+          batch_output,
+          valid_cores,
+          kernel_size,
+          F
+      );
+    }
+
+    pi_cl_team_barrier();
+  }
 }
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c
new file mode 100644
index 0000000000..e042cea132
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c
@@ -0,0 +1,139 @@
+#include "DeeployBasicMath.h"
+
+
+#define REDMULE_BASE_ADDR 0x10201C00
+
+#define REG_MNK_M 0x00
+#define REG_MNK_N 0x04
+#define REG_MNK_K 0x08
+#define REG_X_ADDR 0x0C
+#define REG_Y_ADDR 0x10
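+// Register map note (added for clarity): X and W take the two matmul
+// operands, Y points to the accumulation input, and Z (below) to the output;
+// Y and Z may alias, as in the FP32 matmul further down. The offsets follow
+// this fork's gvsoc RedMule model and should be re-checked against other
+// RedMule integrations before reuse.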
+#define REG_Z_ADDR 0x14
+#define REG_W_ADDR 0x18
+#define REG_COMPUTE_MODE 0x1C
+#define REG_TRIGGER 0x20
+#define REG_WAIT 0x28
+
+void MatMul_fp32_fp32_fp32_Redmule(
+    const float32_t *__restrict__ pSrcA,
+    const float32_t *__restrict__ pSrcB,
+    float32_t *__restrict__ pDstY,
+    uint32_t M,
+    uint32_t N,
+    uint32_t O) {
+
+  // Z accumulates onto Y, and both point at pDstY here, so clear it first.
+  uint32_t total_elements = M * O;
+  for (uint32_t i = 0; i < total_elements; i++) {
+    pDstY[i] = 0.0f;
+  }
+
+  volatile uint16_t *mnk_m = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M);
+  volatile uint16_t *mnk_n = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N);
+  volatile uint16_t *mnk_k = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K);
+
+  *mnk_m = (uint16_t)M;
+  *mnk_n = (uint16_t)N;
+  *mnk_k = (uint16_t)O;
+
+  volatile uint32_t *x_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR);
+  volatile uint32_t *y_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR);
+  volatile uint32_t *z_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR);
+  volatile uint32_t *w_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR);
+
+  *x_addr = (uint32_t)((uintptr_t)pSrcA);
+  *y_addr = (uint32_t)((uintptr_t)pDstY);
+  *z_addr = (uint32_t)((uintptr_t)pDstY);
+  *w_addr = (uint32_t)((uintptr_t)pSrcB);
+
+  volatile uint32_t *compute_mode = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE);
+  *compute_mode = 4; // FP32 mode
+
+  volatile uint32_t *trigger = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER);
+  *trigger; // Reading TRIGGER starts the computation
+
+  volatile uint32_t *wait_reg = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT);
+  (void)*wait_reg; // Blocking read: returns once the accelerator has finished
+}
+
+void MatMul_fp32_fp32_fp32_Redmule_Async(
+    const float32_t *__restrict__ pSrcA,
+    const float32_t *__restrict__ pSrcB,
+    float32_t *__restrict__ pDstY,
+    uint32_t M,
+    uint32_t N,
+    uint32_t O) {
+
+  uint32_t total_elements = M * O;
+  for (uint32_t i = 0; i < total_elements; i++) {
+    pDstY[i] = 0.0f;
+  }
+
+  volatile uint16_t *mnk_m = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M);
+  volatile uint16_t *mnk_n = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N);
+  volatile uint16_t *mnk_k = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K);
+
+  *mnk_m = (uint16_t)M;
+  *mnk_n = (uint16_t)N;
+  *mnk_k = (uint16_t)O;
+
+  volatile uint32_t *x_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR);
+  volatile uint32_t *y_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR);
+  volatile uint32_t *z_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR);
+  volatile uint32_t *w_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR);
+
+  *x_addr = (uint32_t)((uintptr_t)pSrcA);
+  *y_addr = (uint32_t)((uintptr_t)pDstY);
+  *z_addr = (uint32_t)((uintptr_t)pDstY);
+  *w_addr = (uint32_t)((uintptr_t)pSrcB);
+
+  volatile uint32_t *compute_mode = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE);
+  *compute_mode = 4; // FP32 mode
+
+  volatile uint32_t *trigger = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER);
+  *trigger; // Trigger without waiting
+}
+
+uint32_t MatMul_fp32_fp32_fp32_Redmule_Wait() {
+  volatile uint32_t *wait_reg = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT);
+  return *wait_reg;
+}
+
+void Gemm_fp32_fp32_fp32_fp32_Redmule(
+    const float32_t *__restrict__ pSrcA,
+    const float32_t *__restrict__ pSrcB,
+    const float32_t *__restrict__ pBias,
+    float32_t *__restrict__ pDstY,
+    uint32_t M,
+    uint32_t N,
+    uint32_t O) {
+
+  // Y carries the bias, so no zero-initialisation of the output is needed.
+  volatile uint16_t *mnk_m = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M);
+  volatile uint16_t *mnk_n = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N);
+  volatile uint16_t *mnk_k = (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K);
+
+  *mnk_m = (uint16_t)M;
+  *mnk_n = (uint16_t)N;
+  *mnk_k = (uint16_t)O;
+
+  volatile uint32_t *x_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR);
+  volatile uint32_t *y_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR);
+  volatile uint32_t *z_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR);
+  volatile uint32_t *w_addr = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR);
+
+  *x_addr = (uint32_t)((uintptr_t)pSrcA);
+  *y_addr = (uint32_t)((uintptr_t)pBias);
+  *z_addr = (uint32_t)((uintptr_t)pDstY);
+  *w_addr = (uint32_t)((uintptr_t)pSrcB);
+
+  volatile uint32_t *compute_mode = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE);
+  *compute_mode = 4; // FP32 mode
+
+  volatile uint32_t *trigger = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER);
+  *trigger; // Reading TRIGGER starts the computation
+
+  volatile uint32_t *wait_reg = (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT);
+  (void)*wait_reg; // Blocking read: returns once the accelerator has finished
+}
diff --git a/TargetLibraries/PULPOpen/src/MaxPool_fp32.c b/TargetLibraries/PULPOpen/src/MaxPool_fp32.c
index fe8afcde12..05f86674b0 100644
--- a/TargetLibraries/PULPOpen/src/MaxPool_fp32.c
+++ b/TargetLibraries/PULPOpen/src/MaxPool_fp32.c
@@ -72,4 +72,61 @@ void MaxPool2d_fp32_fp32_HWC(
       }
     }
   }
+}
+
+
+void MaxPool2d_ChannelRange_fp32_fp32_HWC(
+    const float32_t *__restrict__ pSrcA,
+    uint32_t W, uint32_t H, uint32_t C,
+    uint32_t Q, uint32_t P, uint32_t SQ, uint32_t SP,
+    float32_t *__restrict__ pDstC,
+    uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right,
+    uint32_t ch_start, uint32_t ch_count) {
+
+  uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1;
+  uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1;
+
+  uint32_t ch_end = ch_start + ch_count;
+
+  if (ch_end > C) {
+    ch_end = C;
+  }
+
+  for (uint32_t h_out = 0; h_out < H_out; ++h_out) {
+    for (uint32_t w_out = 0; w_out < W_out; ++w_out) {
+
+      for (uint32_t c = ch_start; c < ch_end; ++c) {
+        float32_t max_val = -inf;
+
+        int32_t h_in_start = h_out * SP - pad_top;
+        int32_t w_in_start = w_out * SQ - pad_left;
+
+        for (uint32_t p = 0; p < P; ++p) {
+          int32_t h_in = h_in_start + p;
+
+          if (h_in < 0 || h_in >= H) {
+            continue;
+          }
+
+          for (uint32_t q = 0; q < Q; ++q) {
+            int32_t w_in = w_in_start + q;
+
+            if (w_in < 0 || w_in >= W) {
+              continue;
+            }
+
+            uint32_t input_idx = (h_in * W + w_in) * C + c;
+            float32_t val = pSrcA[input_idx];
+
+            if (val > max_val) {
+              max_val = val;
+            }
+          }
+        }
+
+        uint32_t output_idx = (h_out * W_out + w_out) * C + c;
+        pDstC[output_idx] = max_val;
+      }
+    }
+  }
 }
\ No newline at end of file
diff --git a/TargetLibraries/PULPOpen/src/iGELU.c b/TargetLibraries/PULPOpen/src/iGELU.c
deleted file mode 100644
index f6be595b42..0000000000
--- a/TargetLibraries/PULPOpen/src/iGELU.c
+++ /dev/null
@@ -1,33 +0,0 @@
-/* =====================================================================
- * Title: iGELU.c
- * Description:
- *
- * $Date: 13.11.2023
- *
- * ===================================================================== */
-/*
- * Copyright (C) 2020 ETH Zurich and University of Bologna.
- *
- * Author: Moritz Scherer, ETH Zurich
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "DeeployPULPMath.h"
-
-void PULPiGELU_s8_s8(int8_t *data_in, int8_t *data_out, int32_t dataSize,
-                     int8_t b, int16_t one, int32_t input_offset,
-                     int32_t output_offset, int32_t *mul, int32_t *add,
-                     int32_t *shift) {}
diff --git a/cmake/pulp/toolchain_llvm.cmake b/cmake/pulp/toolchain_llvm.cmake
index cabfe6915d..76109b8941 100644
--- a/cmake/pulp/toolchain_llvm.cmake
+++ b/cmake/pulp/toolchain_llvm.cmake
@@ -19,6 +19,7 @@ set(CMAKE_EXECUTABLE_SUFFIX ".elf")
 add_compile_options(
   -target riscv32-unknown-elf
   -march=${ISA}
+  -mabi=ilp32f
   -ffunction-sections
   -fdata-sections
   -fomit-frame-pointer
@@ -27,7 +28,7 @@ add_compile_options(
   -DNUM_CORES=${NUM_CORES}
   -MMD
   -MP
-  --sysroot=${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv/rv32imc
+  --sysroot=${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv/rv32imf
   -fno-builtin-memcpy
   -fno-builtin-memset
)
@@ -38,8 +39,9 @@ add_link_options(
  -MP
  -nostartfiles
  -march=${ISA}
-  -L${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv/rv32imc/lib
-  -L${TOOLCHAIN_INSTALL_DIR}/lib/clang/15.0.0/lib/baremetal/rv32imc/
+  -mabi=ilp32f
+  -L${TOOLCHAIN_INSTALL_DIR}/picolibc/riscv/rv32imf/lib
+  -L${TOOLCHAIN_INSTALL_DIR}/lib/clang/15.0.0/lib/baremetal/rv32imf/
  -z norelro
  -fno-builtin-memcpy
  -fno-builtin-memset
@@ -50,4 +52,4 @@ link_libraries(
)

 add_compile_definitions(__LINK_LD)
-add_compile_definitions(__TOOLCHAIN_LLVM__)
+add_compile_definitions(__TOOLCHAIN_LLVM__)
\ No newline at end of file
diff --git a/toolchain/meson-build-script-rv32imf.txt b/toolchain/meson-build-script-rv32imf.txt
new file mode 100644
index 0000000000..2db512dadf
--- /dev/null
+++ b/toolchain/meson-build-script-rv32imf.txt
@@ -0,0 +1,19 @@
+[binaries]
+c = ['clang', '-target', 'riscv32-unknown-elf', '-march=rv32imf', '-nostdlib']
+ar = 'llvm-ar'
+strip = 'llvm-strip'
+
+[host_machine]
+system = 'none'
+cpu_family = 'riscv32'
+cpu = 'riscv32'
+endian = 'little'
+
+[properties]
+c_args = ['-Werror=double-promotion', '-Wno-unsupported-floating-point-opt', '-fshort-enums', '-mno-relax']
+c_link_args = ['-Wl,-z,noexecstack']
+skip_sanity_check = true
+default_flash_addr = '0x00000000'
+default_flash_size = '0x00400000'
+default_ram_addr = '0x20000000'
+default_ram_size = '0x00200000'
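
Reviewer note: a minimal sketch of how the new RedMule FP32 matmul driver is meant to be invoked from cluster code. It is illustrative only: the wrapper name redmule_matmul_example and the buffer shapes are assumptions, and DeeployPULPMath.h is assumed to declare MatMul_fp32_fp32_fp32_Redmule. The single-issuing-core pattern and the barrier mirror Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule above; all operands must sit in cluster L1, since the accelerator is handed raw pointers.

#include "DeeployPULPMath.h" // assumed to declare MatMul_fp32_fp32_fp32_Redmule
#include "pmsis.h"

#define M 16
#define N 32
#define O 16

// RedMule reads X/W and writes Z directly, so all operands live in L1 TCDM.
static PI_L1 float32_t srcA[M * N]; // X: left operand
static PI_L1 float32_t srcB[N * O]; // W: right operand
static PI_L1 float32_t dstY[M * O]; // Y/Z: accumulator and output

void redmule_matmul_example(void) {
  // A single core programs the memory-mapped accelerator; the other
  // cluster cores just synchronize, as in the RedMule conv kernel above.
  if (pi_core_id() == 0) {
    MatMul_fp32_fp32_fp32_Redmule(srcA, srcB, dstY, M, N, O);
  }
  pi_cl_team_barrier();
}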