diff --git a/.github/workflows/_runner-siracusa-redmule-tiled.yml b/.github/workflows/_runner-siracusa-redmule-tiled.yml new file mode 100644 index 00000000..8bf5265d --- /dev/null +++ b/.github/workflows/_runner-siracusa-redmule-tiled.yml @@ -0,0 +1,161 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-siracusa-redmule-tiled-sequential + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-marker: + required: true + type: string + # Extra flags injected into the pytest command, between -v and the -m + # marker filter. Default keeps the original 4-worker xdist behavior; + # callers that want simulator stdout (e.g. GVSoC cycle counts) in the + # CI log can override with "-s -p no:xdist" to disable capture and + # the parallel worker plugin (xdist eats per-test stdout). + pytest-flags: + required: false + type: string + default: "-n 4" + +jobs: + test-runner-siracusa-redmule-tiled: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.docker-image }} + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . 
+ - name: Run Test + run: | + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + set -o pipefail + pytest test_platforms.py -v ${{ inputs.pytest-flags }} -m "siracusa_redmule_tiled and ${{ inputs.pytest-marker }}" 2>&1 | tee /tmp/pytest_out.log + shell: bash + - name: Report cycle counts (RedMulE side, with speedup vs Siracusa) + if: always() + shell: bash + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HEAD_SHA: ${{ github.event.pull_request.head.sha || github.sha }} + REPO: ${{ github.repository }} + MARKER: ${{ inputs.pytest-marker }} + run: | + python3 - <<'PY' + import json, os, re, sys, urllib.request, pathlib + LOG_PATH = "/tmp/pytest_out.log" + PAT = re.compile(r'(?:^|\s)BENCH train_cycles=(\d+) opt_cycles=(\d+) weight_sram=(\d+)') # not anchored to column 0 only: PAT is also applied to downloaded job logs, whose lines carry a timestamp prefix + + if not pathlib.Path(LOG_PATH).exists(): + print("no pytest log found; skipping") + sys.exit(0) + + # 1. parse RedMulE side's BENCH lines (one per training model) + rmu = [] + with open(LOG_PATH) as fh: + for line in fh: + m = PAT.search(line) + if m: + rmu.append({ + 'train': int(m.group(1)), 'opt': int(m.group(2)), + 'sram': int(m.group(3))}) + if not rmu: + print("No BENCH line in pytest output (kernel-only job?). Skipping summary.") + sys.exit(0) + + out = [] + marker = os.environ.get('MARKER', '?') + sha = os.environ.get('HEAD_SHA', '')[:7] + out.append(f"## Siracusa + RedMulE cycles ({marker})") + out.append("") + out.append("| weight_sram | train_cycles | opt_cycles |") + out.append("|---:|---:|---:|") + for r in rmu: + out.append(f"| {r['sram']:,} | {r['train']:,} | {r['opt']:,} |") + out.append("") + out.append(f"_Counted on commit `{sha}` via GVSoC._") + + # 2. 
best-effort: find Siracusa baseline on same SHA, build speedup table + repo = os.environ.get('REPO', '') + head_sha = os.environ.get('HEAD_SHA', '') + tok = os.environ.get('GH_TOKEN', '') + + def gh(url): + req = urllib.request.Request(url, headers={'Authorization': f'bearer {tok}'}) + with urllib.request.urlopen(req, timeout=20) as r: + return r.read() + + try: + runs = json.loads(gh( + f"https://api.github.com/repos/{repo}/actions/runs" + f"?head_sha={head_sha}&per_page=30")) + base_run_id = next( + (r['id'] for r in runs.get('workflow_runs', []) + if r['name'] == 'CI • Siracusa (Tiled)' and r['event'] == 'push'), + None) + if base_run_id is None: + out += ["", "_No matching `Siracusa (Tiled)` push run on this SHA — speedup diff skipped._"] + else: + jobs = json.loads(gh( + f"https://api.github.com/repos/{repo}/actions/runs/{base_run_id}/jobs")) + base_job_id = next( + (j['id'] for j in jobs.get('jobs', []) + if 'training' in j['name'].lower() + and 'l3' in j['name'].lower() + and j.get('conclusion') == 'success'), + None) + if base_job_id is None: + out += ["", "_Siracusa training-L3 baseline job not finished/green yet — speedup diff skipped._"] + else: + txt = gh(f"https://api.github.com/repos/{repo}/actions/jobs/{base_job_id}/logs").decode('utf-8','replace') + base = {} + for line in txt.splitlines(): + m = PAT.search(line) + if m: + base[int(m.group(3))] = { + 'train': int(m.group(1)), + 'opt': int(m.group(2))} + out += ["", "## Speedup vs Siracusa baseline (matched by weight_sram)", ""] + out += ["| weight_sram | Siracusa train | + RedMulE train | sptrain | Siracusa opt | + RedMulE opt | spopt |"] + out += ["|---:|---:|---:|:---:|---:|---:|:---:|"] + for r in rmu: + b = base.get(r['sram']) + if b is None: + out.append(f"| {r['sram']:,} | — | {r['train']:,} | _no match_ | — | {r['opt']:,} | — |") + else: + st = b['train'] / r['train'] + so = b['opt'] / r['opt'] + out.append( + f"| {r['sram']:,} | {b['train']:,} | {r['train']:,} | **{st:.3f}×** " + f"| 
{b['opt']:,} | {r['opt']:,} | **{so:.3f}×** |") + except Exception as e: + out += ["", f"_Baseline lookup failed: `{type(e).__name__}: {e}` — RedMulE numbers above are still valid._"] + + text = "\n".join(out) + "\n" + print(text) + sp = os.environ.get('GITHUB_STEP_SUMMARY') + if sp: + with open(sp, 'a') as f: + f.write(text) + PY diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml index cc09f234..3e9ecaa1 100644 --- a/.github/workflows/_runner-siracusa-tiled.yml +++ b/.github/workflows/_runner-siracusa-tiled.yml @@ -17,6 +17,14 @@ name: _runner-siracusa-tiled pytest-marker: required: true type: string + # Extra flags injected into the pytest command (between -v and the -m + # marker filter). Default empty preserves the existing sequential + # invocation; callers that want simulator stdout (e.g. GVSoC cycle + # counts) in the CI log can override with "-s" to disable capture. + pytest-flags: + required: false + type: string + default: "" jobs: test-runner-siracusa-tiled: @@ -36,5 +44,28 @@ jobs: - name: Run Test run: | cd DeeployTest - pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}" + set -o pipefail + pytest test_platforms.py -v ${{ inputs.pytest-flags }} -m "siracusa_tiled and ${{ inputs.pytest-marker }}" 2>&1 | tee /tmp/pytest_out.log + shell: bash + - name: Report cycle counts (Siracusa baseline) + if: always() shell: bash + run: | + # Emit every BENCH line from the test stdout into the run summary so + # the RedMulE-side workflow can diff against these numbers for the + # same SHA. Non-training jobs (kernel-only matrices) produce no + # BENCH lines and the step is a quiet no-op. + if ! grep -q '^BENCH train_cycles=' /tmp/pytest_out.log 2>/dev/null; then + echo "No BENCH line found (probably a kernel-only job); skipping summary." 
+ exit 0 + fi + echo "## Siracusa baseline training cycles" >> "$GITHUB_STEP_SUMMARY" + echo "" >> "$GITHUB_STEP_SUMMARY" + echo "| model (weight_sram) | train_cycles | opt_cycles |" >> "$GITHUB_STEP_SUMMARY" + echo "|---|---:|---:|" >> "$GITHUB_STEP_SUMMARY" + grep '^BENCH train_cycles=' /tmp/pytest_out.log | while read -r line; do + tc=$(echo "$line" | sed -nE 's/.*train_cycles=([0-9]+).*/\1/p') + oc=$(echo "$line" | sed -nE 's/.*opt_cycles=([0-9]+).*/\1/p') + ws=$(echo "$line" | sed -nE 's/.*weight_sram=([0-9]+).*/\1/p') + echo "| weight_sram=${ws} | ${tc} | ${oc} |" >> "$GITHUB_STEP_SUMMARY" + done diff --git a/.github/workflows/ci-platform-gap9-tiled.yml b/.github/workflows/ci-platform-gap9-tiled.yml index 6823344a..44b8d17d 100644 --- a/.github/workflows/ci-platform-gap9-tiled.yml +++ b/.github/workflows/ci-platform-gap9-tiled.yml @@ -21,12 +21,16 @@ concurrency: cancel-in-progress: true jobs: + # GAP9 CI requires access to the private ghcr.io/pulp-platform/deeploy-gap9 + # image; gate on upstream org so forks skip cleanly. 
select-env: + if: github.repository_owner == 'pulp-platform' uses: ./.github/workflows/_select-env.yml with: docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/runwangdl/deeploy:gap9' }} gap9-kernels-tiled-singlebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: @@ -35,6 +39,7 @@ jobs: pytest-markers: "gap9_tiled and kernels and singlebuffer and l2" gap9-kernels-tiled-doublebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: @@ -43,6 +48,7 @@ jobs: pytest-markers: "gap9_tiled and kernels and doublebuffer and l2" gap9-models-tiled-singlebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: @@ -51,6 +57,7 @@ jobs: pytest-markers: "gap9_tiled and models and singlebuffer and l2" gap9-models-tiled-doublebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: diff --git a/.github/workflows/ci-platform-gap9.yml b/.github/workflows/ci-platform-gap9.yml index d3bf829a..e2cf26d3 100644 --- a/.github/workflows/ci-platform-gap9.yml +++ b/.github/workflows/ci-platform-gap9.yml @@ -22,12 +22,16 @@ concurrency: cancel-in-progress: true jobs: + # GAP9 CI requires access to the private ghcr.io/pulp-platform/deeploy-gap9 + # image; gate on upstream org so forks skip cleanly. 
select-env: + if: github.repository_owner == 'pulp-platform' uses: ./.github/workflows/_select-env.yml with: docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/runwangdl/deeploy:gap9' }} gap9-kernels: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9.yml with: @@ -36,6 +40,7 @@ jobs: pytest-marker: "kernels" gap9-models: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9.yml with: diff --git a/.github/workflows/ci-platform-siracusa-redmule-tiled.yml b/.github/workflows/ci-platform-siracusa-redmule-tiled.yml new file mode 100644 index 00000000..c0f25e9c --- /dev/null +++ b/.github/workflows/ci-platform-siracusa-redmule-tiled.yml @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Siracusa + RedMulE (Tiled) + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/runwangdl/deeploy:redmule" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + # RedMulE CI needs the fork's custom Docker image that bundles a + # GVSoC build with the light_redmule model. Fall back to + # runwangdl/deeploy:redmule on push/PR events (when no input is + # provided) rather than the upstream devel image. 
+ docker_image_deeploy: ${{ inputs.docker_image_deeploy || 'ghcr.io/runwangdl/deeploy:redmule' }} + + siracusa-redmule-kernels-tiled-singlebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and singlebuffer and l2" + + siracusa-redmule-kernels-tiled-doublebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and doublebuffer and l2" + + siracusa-redmule-training-tiled-singlebuffer-L3: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "training and singlebuffer and l3" + # Disable pytest's stdout capture so GVSoC's "Cycles" report from the + # cct_train simulation lands in the CI log; needs -p no:xdist because + # the parallel worker plugin would otherwise re-buffer stdout. Only + # one test case in this matrix anyway, so dropping -n 4 is harmless. + pytest-flags: "-s -p no:xdist" diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index b65cbb75..69916ee4 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -46,3 +46,7 @@ jobs: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} pytest-marker: "training and l3 and singlebuffer" + # -s makes GVSoC's per-test "Cycles" report visible in the CI log, + # so cct_train cycle counts on plain Siracusa can be diffed against + # the Siracusa+RedMulE run for an apples-to-apples speedup number. 
+ pytest-flags: "-s" diff --git a/.github/workflows/infra-generate-documentation.yml b/.github/workflows/infra-generate-documentation.yml index 84508113..b3036d53 100644 --- a/.github/workflows/infra-generate-documentation.yml +++ b/.github/workflows/infra-generate-documentation.yml @@ -28,12 +28,12 @@ jobs: sphinx-build docs _build - name: Prepare Multipages uses: xeratec/gh-pages-multibranch@pr/support_tags - if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }} + if: ${{ (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && github.repository_owner == 'pulp-platform' }} with: directory: _build - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 - if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch'}} + if: ${{ (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && github.repository_owner == 'pulp-platform' }} with: publish_branch: gh-pages github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e07d64a..3bafd225 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, Siracusa_w_redmule, PULP-Open, GAP9, Generic, Snitch)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka Siracusa_w_redmule PULP-Open GAP9 Generic Snitch) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -31,6 +31,8 @@ elseif(platform STREQUAL Siracusa) message(STATUS "Building for platform 'Siracusa'") elseif(platform 
STREQUAL Siracusa_w_neureka) message(STATUS "Building for platform 'Siracusa_w_neureka'") +elseif(platform STREQUAL Siracusa_w_redmule) + message(STATUS "Building for platform 'Siracusa_w_redmule'") elseif(platform STREQUAL PULPOpen) message(STATUS "Building for platform 'PULP-Open'") elseif(platform STREQUAL GAP9) @@ -196,7 +198,7 @@ if(platform STREQUAL QEMU-ARM) endif() -if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL PULPOpen) +if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule OR platform STREQUAL PULPOpen) if(TOOLCHAIN STREQUAL LLVM) set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/toolchain_llvm.cmake) @@ -206,7 +208,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp.cmake) - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/siracusa/siracusa.cmake) elseif(platform STREQUAL PULPOpen) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp-open/pulp-open.cmake) diff --git a/Deeploy/Targets/Chimera/__init__.py b/Deeploy/Targets/Chimera/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py index 79770fe6..da553857 100644 --- a/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py @@ -24,15 +24,32 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw inputBufferName = parseDict['data_in'] outputBufferName = parseDict['data_out'] + inputShape = ctxt.lookup(inputBufferName).shape + outputShape = ctxt.lookup(outputBufferName).shape + 
perm = parseDict["perm"] + + # Spatial-view interpretation of the perm: it operates on the last + # len(perm) dims of data_in and the last len(perm) dims of data_out. + # MatMulLayer.computeShapes can left-pad the rank of one side without + # touching the other when the same gs.Variable is shared between a + # broadening (MatMul) and a non-broadening (Gemm/Transpose) consumer, + # so the constraint indexing must offset by the per-side leading-batch + # depth rather than assume rank == len(perm) == rank_other. When all + # ranks already match, offsets are 0 and behavior is unchanged. + inputOffset = len(inputShape) - len(perm) + outputOffset = len(outputShape) - len(perm) + assert inputOffset >= 0 and outputOffset >= 0, (f"Transpose perm {perm} is longer than tensor ranks " + f"data_in={inputShape}, data_out={outputShape}") + # Add I/O dimensions to the model as variables for bufferName in [inputBufferName, outputBufferName]: tilerModel.addTensorDimToModel(ctxt, bufferName) - # Map output dims to inputs dims - for idx, perm_idx in enumerate(parseDict["perm"]): + # Map output spatial dims to input spatial dims via perm. + for idx, perm_idx in enumerate(perm): tilerModel.addConstraint( - tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) == tilerModel.getTensorDimVar( - tensorName = inputBufferName, dimIdx = perm_idx)) + tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = outputOffset + idx) == + tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = inputOffset + perm_idx)) return tilerModel @@ -50,7 +67,10 @@ def serializeTilingSolution( replacementTypes = {} replacements: Dict[str, List[int]] = {} - numDims = len(ctxt.lookup(operatorRepresentation['data_in']).shape) + # Match the spatial-view interpretation in addGeometricalConstraint: + # only the last len(perm) dims of data_in are actually transposed, + # so emit exactly len(perm) dimLen_ replacement variables. 
+ numDims = len(operatorRepresentation['perm']) for dim in range(numDims): replacementTypes[f"dimLen_{dim}"] = PointerClass(uint16_t) diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index ef046f19..ea0e880a 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -8,6 +8,8 @@ from Deeploy.CommonExtensions.DataTypes import float32_t from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +float32_tPtr = PointerClass(float32_t) + class PULPFloatGEMMTemplate(NodeTemplate): diff --git a/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py index 64143a9d..bf4ca1d2 100644 --- a/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py @@ -65,16 +65,27 @@ def alignToContext(self, ctxt: NetworkContext, fRep['accessStr'] = accessStr fRep['data_out_shape'] = data_out_shape - parallelDims = [idx for idx, dim in enumerate(data_out_shape) if dim >= 8] + # Spatial-view: perm targets the last len(perm) dims of data_in. When + # data_in has been left-padded (e.g. by MatMulLayer.computeShapes + # broadening a shared upstream Transpose output), offset the + # data_in_shape lookup so dimLen_ reflects the actual + # transposed dim rather than a leading batch placeholder. Same + # for data_out_shape -- parallelDim must index within the spatial + # view since the per-tile for-loop count comes from len(perm). 
+ dataInOffset = len(data_in_shape) - len(perm) + dataOutOffset = len(data_out_shape) - len(perm) + spatialOutShape = list(data_out_shape[dataOutOffset:]) + + parallelDims = [idx for idx, dim in enumerate(spatialOutShape) if dim >= 8] if len(parallelDims) > 0: parallelDim = parallelDims[0] else: - parallelDim = data_out_shape.index(max(data_out_shape)) + parallelDim = spatialOutShape.index(max(spatialOutShape)) forLoops = [] dimLenPtrs = [] for idx, i in enumerate(perm): - operatorRepresentation[f"dimLen_{idx}"] = data_in_shape[idx] + operatorRepresentation[f"dimLen_{idx}"] = data_in_shape[dataInOffset + idx] dimLenPtrs.append(f"dimLen_{idx}") if idx != parallelDim: forLoops.append(_forLoop.generate({"i": i, "dimLenPtr": f"dimLen_{i}"})) diff --git a/Deeploy/Targets/Redmule/Bindings.py b/Deeploy/Targets/Redmule/Bindings.py new file mode 100644 index 00000000..3017f4e8 --- /dev/null +++ b/Deeploy/Targets/Redmule/Bindings.py @@ -0,0 +1,66 @@ +# ---------------------------------------------------------------------- +# +# File: Bindings.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# Luka Macan, University of Bologna +# Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t +from Deeploy.DeeployTypes import NodeBinding +from Deeploy.Targets.Generic.TypeCheckers import ConvChecker, GEMMChecker, MatMulChecker +from Deeploy.Targets.PULPOpen.Bindings import ClusterTransformer, ForkTransformer +from Deeploy.Targets.Redmule.Templates import ConvGradTemplate, ConvTemplate, GEMMTemplate, MatmulTemplate + +RedmuleMatmulBindings = [ + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + MatmulTemplate.referenceTemplate, ForkTransformer) +] + +RedmuleConv2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), ConvTemplate.reference2DIm2ColTemplate, + ForkTransformer) +] + +RedmuleGEMMBindings = [ + NodeBinding( + GEMMChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), GEMMTemplate.referenceTemplate, + ForkTransformer) +] + +# Pointwise (1x1) ConvGradW / ConvGradX routed to RedMulE. The PULP versions +# (PULPFloatPWConvGradW2DBindings / PULPFloatPWConvGradX2DBindings) use the +# same ConvChecker signature, so the binding is identical apart from which +# template -> kernel symbol is selected. 
+RedmulePWConvGradW2DBindings = [ + NodeBinding(ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConvGradTemplate.referencePWConvGradW2DTemplate, ClusterTransformer) +] + +RedmulePWConvGradX2DBindings = [ + NodeBinding(ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConvGradTemplate.referencePWConvGradX2DTemplate, ClusterTransformer) +] diff --git a/Deeploy/Targets/Redmule/Deployer.py b/Deeploy/Targets/Redmule/Deployer.py new file mode 100644 index 00000000..89ba2b92 --- /dev/null +++ b/Deeploy/Targets/Redmule/Deployer.py @@ -0,0 +1,59 @@ +# ---------------------------------------------------------------------- +# +# File: Deployer.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Callable, Dict, Type + +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer +from Deeploy.Targets.Redmule.TopologyOptimizationPasses.Passes import RedMuleGEMMTransposePass + + +class RedmuleDeployer(PULPDeployer): + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda graph: list(graph.nodes), + name: str = 'DeeployNetwork', + default_channels_first = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets = {}): + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + default_channels_first, deeployStateDir, inputOffsets) + + self.loweringOptimizer.passes += [ + # RedMuleAdjustWeightMemoryLayoutPass is currently not registered: + # it transposes Conv weights from [F,P,Q,C] to [P,Q,C,F] for the + # RedMulE accelerator, but Conv is back on PULPClusterEngine (see + # Engine.RedmuleMapping for why) and PULP expects [F,P,Q,C]. + # Restore alongside the Conv mapping when RedmuleConv2DTileConstraint + # learns spatial tiling. + RedMuleGEMMTransposePass("Redmule") + ] diff --git a/Deeploy/Targets/Redmule/Engine.py b/Deeploy/Targets/Redmule/Engine.py new file mode 100644 index 00000000..9b929ab4 --- /dev/null +++ b/Deeploy/Targets/Redmule/Engine.py @@ -0,0 +1,99 @@ +# ---------------------------------------------------------------------- +# +# File: Engine.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper +from Deeploy.Targets.Generic.Layers import ConvGradWLayer, ConvGradXLayer, ConvLayer, GEMMLayer, MatMulLayer +from Deeploy.Targets.Generic.Parsers import MatMulParser +from Deeploy.Targets.PULPOpen.Parsers import PULPFPConv2DParser, PULPPWConvGradW2DParser, PULPPWConvGradX2DParser +from Deeploy.Targets.PULPOpen.Platform import ConvGradWMapper as PULPConvGradWMapper, \ + ConvGradXMapper as PULPConvGradXMapper, DwConvGradWMapper, DwConvGradxMapper +from Deeploy.Targets.Redmule.Parsers import GEMMRedmuleParser +from Deeploy.Targets.Redmule.Tiler import RedmuleConvTilingReadyBindings, RedmuleGEMMTilingReadyBindings, \ + RedmuleMatMulTilingReadyBindings, RedmulePWConvGradW2DTilingReadyBindings, RedmulePWConvGradX2DTilingReadyBindings + +MatMulRedmuleMapper = NodeMapper(MatMulParser(), RedmuleMatMulTilingReadyBindings) +Conv2DRedmuleMapper = NodeMapper(PULPFPConv2DParser(), RedmuleConvTilingReadyBindings) +GEMMMRedmuleMapper = NodeMapper(GEMMRedmuleParser(noBiasHoisting = False), RedmuleGEMMTilingReadyBindings) +# Pointwise (1x1) ConvGradW / ConvGradX reuse PULP's parsers verbatim -- +# they only screen for kernel_shape=[1,1] / group=1 and populate the same +# operatorRepresentation keys our 
Redmule templates consume. +PWConvGradW2DRedmuleMapper = NodeMapper(PULPPWConvGradW2DParser(), RedmulePWConvGradW2DTilingReadyBindings) +PWConvGradX2DRedmuleMapper = NodeMapper(PULPPWConvGradX2DParser(), RedmulePWConvGradX2DTilingReadyBindings) + +RedmuleMapping = { + 'MatMul': MatMulLayer([MatMulRedmuleMapper]), + # 'Conv' is currently routed to PULPClusterEngine (see comment below). + # The RedMulE-accelerated kernel and its template are kept in-tree + # (TargetLibraries/PULPOpen/src/Conv2d_Im2Col_fp32_Redmule.c + + # Deeploy/Targets/Redmule/Templates/ConvTemplate.py) so the path is + # ready to re-enable once RedmuleConv2DTileConstraint learns spatial + # tiling with halo regions. Today its addPolicyConstraint hard-pins + # inputHeightVar / inputWidthVar to the full feature-map size, which + # forces the entire activation tensor into L1 -- workable for tiny + # tokenizer-style Convs (CCT2 has 8x8 inputs and L1=128 KiB fits), + # but ResNet8 / MobileNet middle layers exceed L1 immediately + # (32x32x16 input + 32x32x16 output alone is 128 KiB). PULP's + # Conv2DTileConstraint already supports spatial halos, so falling + # back keeps the bigger Conv-heavy training fixtures tilable while + # MatMul / Gemm continue to bind to RedMulE. + # + # When that tile-constraint upgrade lands, restore: + # 'Conv': ConvLayer([Conv2DRedmuleMapper]), + # and the matching RedMuleAdjustWeightMemoryLayoutPass in Deployer.py. + 'Gemm': GEMMLayer([GEMMMRedmuleMapper]), + # NOTE: ConvGradW / ConvGradX are intentionally NOT mapped here. + # _selectEngine() is first-match across engines, so putting them on the + # RedmuleEngine would route every 3x3 / depthwise ConvGrad through this + # engine's layer and never let PULPClusterEngine see them. 
We tried a + # "complete" RedmuleEngine layer ([PW_Redmule, DW_PULP, regular_PULP]) + # but the resulting tiler hit infeasible memory-pattern constraints on + # ResNet8 / MobileNet despite using identical mapper instances to + # PULP -- some interaction between the layer object identity and the + # tiling-pattern solver we couldn't fully diagnose. + # + # Instead, the RedMulE PWConvGrad mappers are inserted into the + # existing PULPClusterEngine ConvGradW / ConvGradX layers at position 0 + # in RedmulePlatform.__init__. That keeps the layer object identical + # to the pure-PULP path (matters for the tiler) while still ensuring + # 1x1 ConvGrads bind to the RedMulE kernels. +} + +_includeList = [] + +_redmuleInitCode = r""" +// Redmule engine initialization +""" + + +class RedmuleEngine(DeploymentEngine): + + def __init__(self, + name: str, + Mapping = RedmuleMapping, + initCode: str = _redmuleInitCode, + includeList: List[str] = _includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) diff --git a/Deeploy/Targets/Redmule/Parsers.py b/Deeploy/Targets/Redmule/Parsers.py new file mode 100644 index 00000000..d359bbbd --- /dev/null +++ b/Deeploy/Targets/Redmule/Parsers.py @@ -0,0 +1,114 @@ +# ---------------------------------------------------------------------- +# +# File: Parsers.py +# +# Last edited: 15.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Authors: +# - Moritz Scherer, ETH Zurich +# - Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple

import numpy as np
import onnx_graphsurgeon as gs

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import float32_t
from Deeploy.DeeployTypes import NetworkContext
from Deeploy.Targets.Generic.Parsers import MatMulParser


class GEMMRedmuleParser(MatMulParser):
    """Parser for Gemm nodes that are lowered onto the RedMulE GEMM kernel.

    Accepts nodes with at least two inputs, exactly one output and an
    ``alpha`` of 1 (explicit or defaulted). When ``noBiasHoisting`` is False
    and the node has no third input, a zero-filled ``C`` constant shaped like
    the output is hoisted so the three-operand kernel can be used unchanged.
    """

    def __init__(self, noBiasHoisting = True):
        """Store the bias-hoisting policy.

        Args:
            noBiasHoisting: When True (default) a missing ``C`` operand is
                left missing; when False a zero ``C`` tensor is hoisted in
                ``parseNodeCtxt``.
        """
        # Order matters: super().__init__() of MatMulParser also writes
        # self.noBiasHoisting from its own default, so call super first and
        # then overwrite, otherwise our flag gets clobbered to True.
        super().__init__(noBiasHoisting = noBiasHoisting)
        self.noBiasHoisting = noBiasHoisting

    def parseNode(self, node: gs.Node) -> bool:
        """Check the node signature and record the Gemm attributes.

        Missing attributes fall back to ``transA = 0``, ``transB = 0``,
        ``alpha = 1`` and ``beta = 1``.

        Args:
            node: Candidate graph node.

        Returns:
            True when the node has >= 2 inputs, exactly 1 output and
            ``alpha == 1``; False otherwise.
        """
        attrs = node.attrs

        # `alpha` is optional on the node: read it with the same default that
        # is recorded below instead of indexing, which raised KeyError for
        # nodes that omit the attribute.
        ret = all([len(node.inputs) >= 2, len(node.outputs) == 1, attrs.get('alpha', 1) == 1])

        if ret:
            self.operatorRepresentation['transA'] = attrs.get('transA', 0)
            self.operatorRepresentation['transB'] = attrs.get('transB', 0)
            self.operatorRepresentation['alpha'] = attrs.get('alpha', 1)
            self.operatorRepresentation['beta'] = attrs.get('beta', 1)

        return ret

    def parseNodeCtxt(self,
                      ctxt: NetworkContext,
                      node: gs.Node,
                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
        """Bind operand buffer names and, if requested, hoist a zero ``C``.

        Args:
            ctxt: Network context to look buffers up in.
            node: The Gemm node being parsed.
            channels_first: Forwarded unchanged to ``MatMulParser.parseNodeCtxt``.

        Returns:
            The context returned by the parent parser (possibly extended with
            the hoisted ``C`` constant) and the parent's success flag.
        """
        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)

        if ret:
            inputs = ['A', 'B']
            outputs = ['data_out']

            # zip() stops after 'A' and 'B', so an optional third input is
            # handled separately below.
            for key, inputNode in zip(inputs, node.inputs):
                self.operatorRepresentation[key] = newCtxt.lookup(inputNode.name).name
            for idx, outputNode in enumerate(node.outputs):
                self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name

            if len(node.inputs) == 3:
                self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name
            elif not self.noBiasHoisting:
                # Hoist a zero C tensor whose shape matches the GEMM output, so
                # the bias-required RedmuleGEMMTileConstraint and the existing
                # 3-operand kernel template can run unchanged on bias-less
                # Gemm nodes (e.g. backward GradFusedMatMul rewrites in CCT
                # training graphs that emit Y = A @ B with no C).
                outShape = node.outputs[0].shape
                values = np.zeros(outShape, dtype = np.float32)
                zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values)
                newCtxt.hoistConstant(zeroTensor, _type = PointerClass(float32_t))
                # Also wire the hoisted Constant into the gs.Node inputs so the
                # tiler picks it up via its `node.inputs + node.outputs` walk,
                # AND register the Gemm as a user of the new buffer so the
                # MemoryConstraintFlow's kill-set analysis (which walks
                # `_users`) can find a consumer for it. Without these the
                # tiler / flow analyzer KeyError or assert on the C tensor.
                node.inputs.append(zeroTensor)
                newCtxt.addUser(f'{node.name}_C_Tensor', node)
                self.operatorRepresentation['C'] = f'{node.name}_C_Tensor'

            # Number of elements of the first operand (A).
            self.operatorRepresentation['size'] = np.prod(newCtxt.lookup(node.inputs[0].name).shape)

        return newCtxt, ret


# diff --git a/Deeploy/Targets/Redmule/Platform.py b/Deeploy/Targets/Redmule/Platform.py new file mode 100644 index 00000000..8906b6d2 --- /dev/null +++ b/Deeploy/Targets/Redmule/Platform.py @@ -0,0 +1,71 @@
# ----------------------------------------------------------------------
#
# File: Platform.py
#
# Last edited: 08.05.2025
#
# Copyright (C) 2024, ETH Zurich and University of Bologna.
#
# Author: Run Wang, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from Deeploy.DeeployTypes import TopologyOptimizer
from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPConstantBuffer, PULPOptimizer, PULPPlatform, \
    PULPStructBuffer, PULPTransientBuffer, PULPVariableBuffer
from Deeploy.Targets.Redmule.Engine import PWConvGradW2DRedmuleMapper, PWConvGradX2DRedmuleMapper, RedmuleEngine

# Same pass list as the PULP optimizer, copied into a fresh TopologyOptimizer.
RedmuleOptimizer = TopologyOptimizer([*PULPOptimizer.passes])


class RedmulePlatform(PULPPlatform):
    """PULP platform with a RedMulE engine placed ahead of the PULP cluster.

    Besides forwarding its arguments to ``PULPPlatform``, the constructor
    prepends the RedMulE pointwise ConvGrad mappers to the ``ConvGradW`` /
    ``ConvGradX`` layers of the engine named ``"PULPCluster"``.
    """

    def __init__(self,
                 engines = None,
                 variableBuffer = PULPVariableBuffer,
                 constantBuffer = PULPConstantBuffer,
                 structBuffer = PULPStructBuffer,
                 transientBuffer = PULPTransientBuffer) -> None:
        """Build the platform and hook the RedMulE PWConvGrad mappers in.

        Args:
            engines: Engines in priority order. ``None`` (default) builds a
                fresh ``[RedmuleEngine("Redmule"),
                PULPClusterEngine("PULPCluster")]`` for this instance.
            variableBuffer: Variable-buffer class forwarded to the base class.
            constantBuffer: Constant-buffer class forwarded to the base class.
            structBuffer: Struct-buffer class forwarded to the base class.
            transientBuffer: Transient-buffer class forwarded to the base class.
        """
        # A list literal as default would be evaluated once at definition
        # time, so every platform would share the same engine instances.
        if engines is None:
            engines = [RedmuleEngine("Redmule"), PULPClusterEngine("PULPCluster")]

        super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer)

        # Insert the RedMulE PWConvGrad mappers at position 0 of the
        # PULPClusterEngine's ConvGradW / ConvGradX layer mapper lists.
        # See the comment on RedmuleMapping in Engine.py: we cannot route
        # those op types through RedmuleEngine itself without confusing the
        # tiler, so we mutate the (still-pure-PULP) PULPClusterEngine layer
        # in place. Order matters: PW Redmule must come before PULP's PW
        # mapper, so 1x1 ConvGrads bind to the RedMulE kernel; non-PW
        # variants fall through to PULP's DW / regular mappers as before.
        pulp_cluster = next((e for e in self.engines if e.name == "PULPCluster"), None)
        if pulp_cluster is not None:
            # Both PWConvGradW and PWConvGradX RedMulE mappers are hooked up
            # to PULPCluster's existing layer mapper lists. ConvGradW was
            # disabled temporarily in 68d1639 because its template sized the
            # transpose buffer at C_in * H_in * W_in, which over-counted the
            # actual footprint for stride > 1 1x1 convs (ResNet8 layer2/3
            # downsample) and tripped tiler infeasibility on the regular-Conv
            # backward pattern memory. After dropping that to the exact
            # C_in * H_out * W_out and teaching the kernel to sample X at
            # strided positions, the W path is back in.
            for op_type, redmule_mapper in (
                ("ConvGradW", PWConvGradW2DRedmuleMapper),
                ("ConvGradX", PWConvGradX2DRedmuleMapper),
            ):
                layer_factory = pulp_cluster.Mapping.get(op_type)
                if layer_factory is not None and hasattr(layer_factory, "maps"):
                    # Avoid double-inserting across repeated platform inits
                    # (the layer object may be shared between engines).
                    if redmule_mapper not in layer_factory.maps:
                        layer_factory.maps.insert(0, redmule_mapper)


# diff --git a/Deeploy/Targets/Redmule/Templates/ConvGradTemplate.py b/Deeploy/Targets/Redmule/Templates/ConvGradTemplate.py new file mode 100644 index 00000000..b2246cdb --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/ConvGradTemplate.py @@ -0,0 +1,148 @@
# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
#
# SPDX-License-Identifier: Apache-2.0
"""Templates that route Pointwise (1x1) ConvGradW / ConvGradX to RedMulE.

Both kernels reuse the existing PULPOpen tile constraints
(PWConvGradWTileConstraint / PWConvGradXTileConstraint) so the tile-shape
search is identical to the pulp-trainlib variants; only the kernel body
calls into PWConvGrad_fp32_Redmule.c instead, which materialises the
necessary transpose into a transient buffer and then fires a single
RedMulE GEMM.
"""

from typing import Dict, List, Tuple, Union

from ortools.constraint_solver.pywrapcp import IntVar

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class RedmulePWConvGradWTemplate(NodeTemplate):
    """RedMulE pointwise ConvGradW: dW = dY @ X^T (1x1 kernel).

    Reserves a fixed-size transient scratch buffer of
    ``PWGW_CHUNK_P * (C_in + C_out)`` elements, independent of the
    feature-map area. The buffer is passed to
    PWConvGradW2d_fp32_fp32_fp32_CHW_Redmule (in
    TargetLibraries/PULPOpen/src/PWConvGrad_fp32_Redmule.c); how the kernel
    splits the work into chunks is defined there, not in this template.
    """

    def __init__(self, templateStr: str):
        super().__init__(templateStr)

    # Must stay in sync with PWGW_CHUNK_P in PWConvGrad_fp32_Redmule.c.
    PWGW_CHUNK_P = 16

    @staticmethod
    def computeTransientBuffersSize(
            ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
        """Return ``[(buffer name, size in bytes)]`` for the chunk scratch.

        The name is ``<nodeName>_transpose_buffer``; the size is
        ``(data_in_type.typeWidth // 8) * PWGW_CHUNK_P * (ch_im_in + ch_im_out)``.
        """
        # Fixed-size chunk scratch: PWGW_CHUNK_P rows of [C_in] for the
        # X-sampled-and-transposed slice + PWGW_CHUNK_P rows of [C_out] for
        # the dY view (used by the multi-chunk path when P > CHUNK_P).
        # Independent of the layer's feature-map area -- crucial on
        # MobileNetV1 early blocks where H_out * W_out can hit 48*48 and a
        # full transpose buffer would blow L1.
        wbytes = operatorRepresentation["data_in_type"].typeWidth // 8
        chunk = RedmulePWConvGradWTemplate.PWGW_CHUNK_P
        bt_dim = wbytes * chunk * (operatorRepresentation['ch_im_in'] +
                                   operatorRepresentation['ch_im_out'])
        bt_name = operatorRepresentation['nodeName'] + "_transpose_buffer"
        return [(bt_name, bt_dim)]

    def hoistTransientBuffers(
            self, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the scratch buffer and expose it to the template.

        Sets ``transposeBuffer`` / ``transposeBufferSize`` in
        ``operatorRepresentation`` and returns the hoisted buffer name.
        """
        bt_name, bt_dim = RedmulePWConvGradWTemplate.computeTransientBuffersSize(
            ctxt, operatorRepresentation)[0]
        ctxt.hoistTransientBuffer(bt_name, bt_dim)
        operatorRepresentation['transposeBuffer'] = bt_name
        operatorRepresentation['transposeBufferSize'] = bt_dim
        return ctxt, operatorRepresentation, [bt_name]


class RedmulePWConvGradXTemplate(NodeTemplate):
    """RedMulE pointwise ConvGradX: dX = scatter(W^T @ dY) (1x1 kernel).

    For stride 1 the transpose buffer only holds C_in * C_out floats (the
    transposed weight matrix); the RedMulE GEMM writes the [C_in, H*W]
    result straight into pGradIn.

    For stride > 1 the GEMM output is the *dense* [C_in, H_out * W_out]
    matrix and must be scattered into the [C_in, H_in, W_in] dX tensor at
    the strided positions (the rest of dX stays zero). In that case the
    transpose buffer is also reused to hold the dense GEMM result, so the
    template reserves C_in * C_out + C_in * H_out * W_out floats. At
    stride 1 the dense buffer is unused but the over-allocation is small
    enough to keep the worst-case size simple.
    """

    def __init__(self, templateStr: str):
        super().__init__(templateStr)

    @staticmethod
    def computeTransientBuffersSize(
            ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
        """Return ``[(buffer name, size in bytes)]`` for the worst case.

        The size is ``(weight_type.typeWidth // 8) *
        (ch_im_in * ch_im_out + ch_im_in * dim_im_out_x * dim_im_out_y)``.
        """
        wt_elts = operatorRepresentation['ch_im_in'] * operatorRepresentation['ch_im_out']
        dense_elts = operatorRepresentation['ch_im_in'] * operatorRepresentation[
            'dim_im_out_x'] * operatorRepresentation['dim_im_out_y']
        bt_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * (wt_elts + dense_elts)
        bt_name = operatorRepresentation['nodeName'] + "_transpose_buffer"
        return [(bt_name, bt_dim)]

    def hoistTransientBuffers(
            self, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the buffer and expose its name and byte size to the template."""
        bt_name, bt_dim = RedmulePWConvGradXTemplate.computeTransientBuffersSize(
            ctxt, operatorRepresentation)[0]
        ctxt.hoistTransientBuffer(bt_name, bt_dim)
        operatorRepresentation['transposeBuffer'] = bt_name
        operatorRepresentation['transposeBufferSize'] = bt_dim
        return ctxt, operatorRepresentation, [bt_name]


# One kernel call per batch element; the dY and X pointers advance by one
# sample each iteration while the dW output pointer stays fixed.
referencePWConvGradW2DTemplate = RedmulePWConvGradWTemplate("""
// 2D FP Pointwise ConvGradW (1x1) CHW via RedMulE (Name: ${nodeName}, Op: ${nodeOp})
${grad_out_type.typeName} ref_${grad_weight}_${grad_out} = ${grad_out};
${data_in_type.typeName} ref_${grad_weight}_${data_in} = ${data_in};
${grad_weight_type.typeName} ref_${grad_weight}_out = ${grad_weight};

for (uint32_t n = 0; n < ${batch}; ++n) {
    PWConvGradW2d_fp${grad_out_type.referencedType.typeWidth}_fp${data_in_type.referencedType.typeWidth}_fp${grad_weight_type.referencedType.typeWidth}_CHW_Redmule(
        ref_${grad_weight}_${grad_out},
        ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out},
        ref_${grad_weight}_${data_in},
        ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in},
        ref_${grad_weight}_out,
        ${transposeBuffer}
    );

    ref_${grad_weight}_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x};
    ref_${grad_weight}_${data_in} += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x};
}
""")

# One kernel call per batch element; the dY and dX pointers advance by one
# sample each iteration while the weight pointer stays fixed.
referencePWConvGradX2DTemplate = RedmulePWConvGradXTemplate("""
// 2D FP Pointwise ConvGradX (1x1) CHW via RedMulE (Name: ${nodeName}, Op: ${nodeOp})
${grad_out_type.typeName} ref_${grad_in}_${grad_out} = ${grad_out};
${weight_type.typeName} ref_${grad_in}_${weight} = ${weight};
${grad_in_type.typeName} ref_${grad_in}_out = ${grad_in};

for (uint32_t n = 0; n < ${batch}; ++n) {
    PWConvGradX2d_fp${grad_out_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${grad_in_type.referencedType.typeWidth}_CHW_Redmule(
        ref_${grad_in}_${grad_out},
        ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out},
        ref_${grad_in}_${weight},
        ${ch_im_in},
        ref_${grad_in}_out,
        ${dim_im_in_x}, ${dim_im_in_y},
        ${transposeBuffer}, ${transposeBufferSize}
    );

    ref_${grad_in}_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x};
    ref_${grad_in}_out += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x};
}
""")

# diff --git a/Deeploy/Targets/Redmule/Templates/ConvTemplate.py b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py new file mode 100644 index 00000000..3ce9d61e --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py @@ -0,0 +1,98 @@
#
# ----------------------------------------------------------------------
#
# File: ConvTemplate.py
#
# Last edited: 09.05.2025
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Run Wang, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple, Union

from ortools.constraint_solver.pywrapcp import IntVar

from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation


class RedmuleFloatConvIm2ColTemplate(NodeTemplate):
    """Template for the RedMulE im2col convolution kernel.

    Hoists one transient buffer of
    ``FP32_BYTES * IM2COL_CHUNK_ROWS * ch_im_in * dim_kernel_x * dim_kernel_y``
    bytes and exposes it to the template as ``ctxtBuffer`` /
    ``ctxtBufferSize``.
    """

    # Must stay in sync with the IM2COL_CHUNK_ROWS macro in
    # Conv2d_Im2Col_fp32_Redmule.c.
    IM2COL_CHUNK_ROWS = 16

    # Bytes per buffer element; the buffer is sized for FP32 values.
    FP32_BYTES = 4

    def __init__(self, templateStr):
        super().__init__(templateStr)

    @staticmethod
    def computeTransientBuffersSize(
            ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
        """Return ``[(buffer name, size in bytes)]`` for the im2col scratch.

        The name is ``<nodeName>_buffer``.
        """
        # Streaming im2col buffer: IM2COL_CHUNK_ROWS rows of K = C*P*Q FP32
        # values. A full-image im2col would blow L1
        # for non-trivial Conv layers (e.g. ResNet8 with H_out*W_out=1024
        # and K=144 -> 576 KiB), which made the tiler infeasible; capping
        # the buffer at 16 rows keeps every Conv layer tilable, at the cost
        # of a few extra RedMulE MMIO triggers per layer.
        cls = RedmuleFloatConvIm2ColTemplate
        k_per_row = (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] *
                     operatorRepresentation['dim_kernel_y'])
        im2col_dim = cls.FP32_BYTES * cls.IM2COL_CHUNK_ROWS * k_per_row
        im2col_name = operatorRepresentation['nodeName'] + "_buffer"
        return [(im2col_name, im2col_dim)]

    def hoistTransientBuffers(self, ctxt: NetworkContext,
                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
        """Hoist the im2col buffer and record its name and size for the template."""
        im2col_name, im2col_dim = RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize(
            ctxt, operatorRepresentation)[0]
        ctxt.hoistTransientBuffer(im2col_name, im2col_dim)

        operatorRepresentation['ctxtBuffer'] = im2col_name
        operatorRepresentation['ctxtBufferSize'] = im2col_dim
        return ctxt, operatorRepresentation, [im2col_name]


# One kernel call per batch element; input and output pointers advance by
# one sample each iteration.
reference2DIm2ColTemplate = RedmuleFloatConvIm2ColTemplate("""
// 2D FP Conv HWC Parallel with Im2Col (Name: ${nodeName}, Op: ${nodeOp})
${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};

for (uint32_t n=0; n<${batch}; ++n) {

    Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC_8_Redmule(
        ref_${data_out}_${data_in},
        ${dim_im_in_y},
        ${dim_im_in_x},
        ${ch_im_in},
        ${weight},
        ${dim_kernel_y},
        ${dim_kernel_x},
        ${stride_y},
        ${stride_x},
        ${bias},
        ${has_bias},
        ref_${data_out}_${data_out},
        ${ch_im_out},
        ${padding_y_top},
        ${padding_y_bottom},
        ${padding_x_left},
        ${padding_x_right},
        ${ctxtBuffer}
    );

    ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y};
    ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y};
}
""")

# diff --git a/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py new file mode 100644 index 00000000..ba41ab76 --- /dev/null +++
# b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py @@ -0,0 +1,61 @@
# ----------------------------------------------------------------------
#
# File: GEMMTemplate.py
#
# Last edited: 27.01.2025
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Run Wang, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from Deeploy.DeeployTypes import NodeTemplate

# GEMM code template. Only the core whose pi_core_id() is 0 issues kernel
# calls, one per batch element. `beta == 0` is resolved at template-render
# time: it emits the two-operand MatMul kernel (C is not passed), any other
# value emits the three-operand Gemm kernel.
# NOTE(review): the output (and C) width suffixes in the kernel names are
# taken from B_type rather than data_out_type / C_type -- equivalent only
# while all operands share one width; confirm this is intended.
referenceTemplate = NodeTemplate("""
// GEMM using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp})

int8_t ${nodeName}_core_id = pi_core_id();

if (${nodeName}_core_id == 0) {
    for(uint32_t b=0; b<${batch}; b++) {
        ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
        ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
        ${C_type.typeName} batch_C = ${C} + b * ${M} * ${O};
        ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};

        % if beta == 0:
        MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
            (const float32_t *) batch_A,
            (const float32_t *) batch_B,
            (float32_t *) batch_out,
            ${M},
            ${N},
            ${O}
        );
        % else:
        Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
            (const float32_t *) batch_A,
            (const float32_t *) batch_B,
            (const float32_t *) batch_C,
            (float32_t *) batch_out,
            ${M},
            ${N},
            ${O}
        );
        % endif
    }
}
""")

# diff --git a/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py new file mode 100644 index 00000000..cb077ca8 --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py @@ -0,0 +1,49 @@
# ----------------------------------------------------------------------
#
# File: MatmulTemplate.py
#
# Last edited: 27.01.2025
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author: Run Wang, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from Deeploy.DeeployTypes import NodeTemplate

# MatMul code template. Only the core whose pi_core_id() is 0 issues kernel
# calls, one per batch element; the kernel name is fixed to the fp32 variant.
# NOTE(review): `${nodeName}_num_cores` is assigned but never read in this
# template -- confirm whether it can be dropped from the generated code.
referenceTemplate = NodeTemplate("""
// Matmul using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp})

int8_t ${nodeName}_core_id = pi_core_id();
int8_t ${nodeName}_num_cores = NUM_CORES;

if (${nodeName}_core_id == 0) {
    for(uint32_t b=0; b<${batch}; b++) {
        ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
        ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
        ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};

        MatMul_fp32_fp32_fp32_Redmule(
            (const float32_t *) batch_A,
            (const float32_t *) batch_B,
            (float32_t *) batch_out,
            ${M},
            ${N},
            ${O}
        );
    }
}
""")
# \ No newline at end of file
# diff --git a/Deeploy/Targets/Redmule/Templates/__init__.py b/Deeploy/Targets/Redmule/Templates/__init__.py new file mode 100644 index 00000000..a73187ca --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/__init__.py @@ -0,0 +1,26 @@
# ----------------------------------------------------------------------
#
# File: __init__.py
#
# Last edited: 08.05.2025
#
# Copyright (C) 2024, ETH Zurich and University of Bologna.
#
# Author: Run Wang, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Package initialiser for the Templates package; its only statement is a
# star-import of the package itself.
from . import *

# diff --git a/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py new file mode 100644 index 00000000..1b3a93f6 --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py @@ -0,0 +1,279 @@
# ----------------------------------------------------------------------
#
# File: ConvTileConstraint.py
#
# Last edited: 09.05.2025
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author:
# - Run Wang, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Tuple, Union

from ortools.constraint_solver.pywrapcp import IntVar

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t
from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TileConstraint import TileConstraint
from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
    VariableReplacementScheme


class RedmuleConv2DTileConstraint(TileConstraint):
    """Tile constraint for the RedMulE 2D convolution.

    The input and output tensors are indexed as (batch, H, W, C) and the
    weight tensor as (H, W, Cin, Cout). The policy pins every input and
    kernel dimension to its full size, so only the output-channel dimension
    of the weight is left for the solver to tile.
    """

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Register the tensor dimensions and tie output dims to input dims.

        Adds: output batch == input batch, output channels == weight
        Cout, and the output height/width formula
        ``(effective - (kernel - 1) - 1) // stride + 1``.
        """

        # Get to-be-tiled tensor's buffers
        inputBufferName = parseDict['data_in']
        weightBufferName = parseDict['weight']
        outputBufferName = parseDict['data_out']

        strides = parseDict["strides"]
        padding = parseDict["pads"]
        # NOTE(review): `dilation` is read but not used in the size formula
        # below, which therefore only holds for dilation 1 -- confirm.
        dilation = parseDict["dilations"]

        # Add I/O dimensions to the model as variables
        for bufferName in [inputBufferName, weightBufferName, outputBufferName]:
            tilerModel.addTensorDimToModel(ctxt, bufferName)

        inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0)
        inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1)
        inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2)
        inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3)

        # Updated dimension indexes for (H, W, Cin, Cout) format
        weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0)
        weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1)
        weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2)
        weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3)

        outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0)
        outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1)
        outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2)
        outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3)

        # Map output dims to inputs dims
        tilerModel.addConstraint(outputBatchVar == inputBatchVar)  # Batch
        tilerModel.addConstraint(outputChannelVar == weightOutChannelVar)  # Output Channel (now at index 3)

        inputBuffer = ctxt.lookup(inputBufferName)

        # Padding only contributes when the tile spans the full input extent
        # (the comparison evaluates to 0/1 and gates the padding term).
        effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1]))
        effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2]))

        tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (weightHeightVar - 1) - 1) // strides[0] + 1))
        tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (weightWidthVar - 1) - 1) // strides[1] + 1))

        return tilerModel

    @staticmethod
    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Pin input and kernel dims to full size; tile only output channels.

        For ``ch_im_out >= 12`` the output-channel tile is constrained via
        ``addTileSizeDivisibleConstraint`` with 12; otherwise it is pinned to
        the variable's maximum.
        """

        # Get to-be-tiled tensor's buffers
        inputBuffer = ctxt.lookup(name = parseDict['data_in'])
        weightBuffer = ctxt.lookup(name = parseDict['weight'])

        inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1)
        inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2)
        inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3)

        # Updated dimension indexes for (H, W, Cin, Cout) format
        weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0)
        weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1)
        weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2)
        weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3)

        strides = parseDict["strides"]
        padding = parseDict["pads"]

        tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in'])
        # RW: Conv only tiled on outchannel
        # Here the `_x` keys are bound to dimension index 1 (height) and the
        # `_y` keys to index 2 (width).
        tilerModel.addConstraint(inputHeightVar == parseDict['dim_im_in_x'])
        tilerModel.addConstraint(inputWidthVar == parseDict['dim_im_in_y'])
        # NOTE(review): same constraint as three lines above; redundant.
        tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in'])

        tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x'])
        tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y'])
        tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in'])

        outChannel = parseDict["ch_im_out"]
        if outChannel >= 12:
            tilerModel.addTileSizeDivisibleConstraint(parseDict,
                                                      "ch_im_out",
                                                      weightOutChannelVar,
                                                      12,
                                                      strategy = PerformanceHint(priority = 1))
        else:
            tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max(), strategy = PerformanceHint(1))

        return tilerModel

    @staticmethod
    def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
                                 ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
        """Return a copy of ``parseDict`` with three entries made symbolic.

        ``dim_im_in_x``, ``dim_kernel_x`` and ``dim_kernel_y`` are replaced
        by the corresponding tiler dimension variables; every other entry
        keeps its concrete value.
        """

        inputBuffer = ctxt.lookup(name = parseDict['data_in'])
        weightBuffer = ctxt.lookup(name = parseDict['weight'])

        symbolicParseDict = parseDict.copy()
        symbolicParseDict['dim_im_in_x'] = tilerModel.getTensorDimVar(inputBuffer.name, 1)
        # Using updated dimension indexes for kernel dimensions
        symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 0)
        symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 1)

        return symbolicParseDict

    @staticmethod
    def computeMargins(kernelShape: Tuple[int, ...]) -> Tuple[int, ...]:
        """Return ``(left, right, top, bottom)`` margins for a kernel shape.

        ``kernelShape[1]`` drives left/right and ``kernelShape[0]`` drives
        top/bottom. An even kernel size yields margins of 0; an odd size
        yields ``size // 2`` on both sides.
        """
        if kernelShape[1] % 2 == 0:
            leftMargin = 0
            rightMargin = 0
        else:
            leftMargin = ((kernelShape[1]) // 2)
            rightMargin = ((kernelShape[1]) // 2)

        if kernelShape[0] % 2 == 0:
            topMargin = 0
            bottomMargin = 0
        else:
            topMargin = ((kernelShape[0]) // 2)
            bottomMargin = ((kernelShape[0]) // 2)

        return leftMargin, rightMargin, topMargin, bottomMargin

    @staticmethod
    def computeInputCube(kernelShape: Tuple[int, ...], pads: Tuple[int, ...], strides: Tuple[int, ...],
                         weightChannels: int, outputCube: HyperRectangle,
                         outputDims: Tuple[int, ...]) -> Tuple[HyperRectangle, Tuple[int, ...]]:
        """Derive the input tile needed to produce one output tile.

        Returns:
            The input ``HyperRectangle`` (all ``weightChannels`` channels,
            channel offset 0) and the padding that applies to this tile as
            ``(left, right, top, bottom)``. Padding is non-zero only on
            sides where the output tile touches the output tensor border.
        """

        (BatchOffset, HOffset, WOffset, COffset) = outputCube.offset
        (BatchSize, HSize, WSize, CSize) = outputCube.dims

        leftMargin, rightMargin, topMargin, bottomMargin = RedmuleConv2DTileConstraint.computeMargins(kernelShape)

        # Boolean * int: the pad is kept only when the tile sits on that edge.
        padding_top = (HOffset == 0) * pads[0]
        padding_bottom = (HOffset + HSize == outputDims[1]) * pads[2]

        padding_left = (WOffset == 0) * pads[1]
        padding_right = (WOffset + WSize == outputDims[2]) * pads[3]

        # Interior tiles start one margin earlier in the input.
        inputHOffset = HOffset * strides[0] - topMargin * (HOffset != 0)
        inputWOffset = WOffset * strides[1] - leftMargin * (WOffset != 0)

        inputHSize = HSize * strides[0] + (topMargin + bottomMargin) - (padding_top + padding_bottom)
        inputWSize = WSize * strides[1] + (leftMargin + rightMargin) - (padding_left + padding_right)

        InCube = HyperRectangle((BatchOffset, inputHOffset, inputWOffset, 0),
                                (BatchSize, inputHSize, inputWSize, weightChannels))

        return InCube, (padding_left, padding_right, padding_top, padding_bottom)

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        """Turn the solved output tiles into load schedules and replacements.

        For every output tile this computes the matching input tile and
        weight slice, and records the per-tile values of the input/output
        dims, ``ch_im_out`` and the four paddings.

        Returns:
            The variable replacement scheme and the tiling schedule.
        """
        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]

        addrNames = ['data_in', 'weight', 'data_out']
        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, addrNames)

        varWeight = operatorRepresentation['weight']
        varOut = operatorRepresentation['data_out']

        inputInCubes = []
        inputWeightCubes = []
        replacements: Dict[str, List[int]] = {
            "dim_im_in_x": [],
            "dim_im_in_y": [],
            "dim_im_out_x": [],
            "dim_im_out_y": [],
            "ch_im_out": [],
            "padding_y_top": [],
            "padding_y_bottom": [],
            "padding_x_left": [],
            "padding_x_right": []
        }

        replacementTypes = {
            "dim_im_in_x": PointerClass(uint16_t),
            "dim_im_in_y": PointerClass(uint16_t),
            "dim_im_out_x": PointerClass(uint16_t),
            "dim_im_out_y": PointerClass(uint16_t),
            "ch_im_out": PointerClass(uint16_t),
            "padding_y_top": PointerClass(uint8_t),
            "padding_y_bottom": PointerClass(uint8_t),
            "padding_x_left": PointerClass(uint8_t),
            "padding_x_right": PointerClass(uint8_t)
        }

        # Updated dimension indexes for (H, W, Cin, Cout) format
        weightH = ctxt.lookup(varWeight).shape[0]  # Now index 0
        weightW = ctxt.lookup(varWeight).shape[1]  # Now index 1
        weightC = ctxt.lookup(varWeight).shape[2]  # Now index 2 (Cin)

        pads = operatorRepresentation['pads']
        strides = operatorRepresentation['strides']

        for cube in outputCubes:
            (BatchOffset, HOffset, WOffset, COffset) = cube.offset
            (BatchSize, HSize, WSize, CSize) = cube.dims

            InCube, padding_tuple = RedmuleConv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides,
                                                                                 weightC, cube,
                                                                                 ctxt.lookup(varOut).shape)

            padding_left, padding_right, padding_top, padding_bottom = padding_tuple

            # `_x` entries receive the height extent and `_y` the width
            # extent, matching the binding used in addPolicyConstraint.
            replacements['dim_im_in_x'].append(InCube.dims[1])
            replacements['dim_im_in_y'].append(InCube.dims[2])
            replacements['dim_im_out_x'].append(HSize)
            replacements['dim_im_out_y'].append(WSize)
            replacements['ch_im_out'].append(CSize)

            replacements['padding_y_top'].append(padding_top)
            replacements['padding_y_bottom'].append(padding_bottom)
            replacements['padding_x_left'].append(padding_left)
            replacements['padding_x_right'].append(padding_right)

            inputInCubes.append(InCube)

            # Updated WeightCube for (H, W, Cin, Cout) format
            # COffset is now applied to dimension 3 (Cout)
            WeightCube = HyperRectangle((0, 0, 0, COffset), (weightH, weightW, weightC, CSize))

            inputWeightCubes.append(WeightCube)

        inputLoadSchedule = []
        outputLoadSchedule = []

        for a, b in zip(inputInCubes, inputWeightCubes):
            inputLoadSchedule.append({"data_in": a, "weight": b})

        for out in outputCubes:
            outputLoadSchedule.append({"data_out": out})

        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)

        return variableReplacementSchedule, tilingSchedule


# diff --git a/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py new file mode 100644 index 00000000..fbae4824 --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py @@ -0,0 +1,198 @@
# ----------------------------------------------------------------------
#
# File: GEMMTileConstraint.py
#
# Last edited: 02.06.2023
#
# Copyright (C) 2023, ETH Zurich and University of Bologna.
#
# Author:
# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich
# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich
#
# ----------------------------------------------------------------------
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, List, Tuple

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t
from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TileConstraint import TileConstraint
from Deeploy.TilingExtension.TilerModel import TilerModel
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
    VariableReplacementScheme


class RedmuleGEMMTileConstraint(TileConstraint):
    """Tile constraints for the RedMulE GEMM node (inputs ``A``, ``B``, ``C``; output ``data_out``)."""

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Register all four tensors with the tiler and tie their matrix dims together.

        The matrix dims are the last two axes of each tensor; ``transA`` /
        ``transB`` select which of the two is the row and which the column axis.
        """
        matA = ctxt.lookup(name = parseDict['A'])
        matB = ctxt.lookup(name = parseDict['B'])
        matC = ctxt.lookup(name = parseDict['C'])
        matOut = ctxt.lookup(name = parseDict['data_out'])

        for tensor in (matA, matB, matC, matOut):
            tilerModel.addTensorDimToModel(ctxt, tensor.name)

        def dimVar(tensor, dimIdx):
            return tilerModel.getTensorDimVar(tensorName = tensor.name, dimIdx = dimIdx)

        # Index of the first matrix axis of every tensor.
        baseA = len(matA.shape) - 2
        baseB = len(matB.shape) - 2
        baseC = len(matC.shape) - 2
        baseOut = len(matOut.shape) - 2

        transA = parseDict['transA']
        transB = parseDict['transB']

        aRows = dimVar(matA, baseA + transA)
        aCols = dimVar(matA, baseA + 1 - transA)
        bRows = dimVar(matB, baseB + transB)
        bCols = dimVar(matB, baseB + 1 - transB)
        outRows = dimVar(matOut, baseOut)
        outCols = dimVar(matOut, baseOut + 1)

        # Output tile shape follows A's rows and B's columns.
        tilerModel.addConstraint(outRows == aRows)
        tilerModel.addConstraint(outCols == bCols)

        # Shared reduction dimension.
        tilerModel.addConstraint(aCols == bRows)

        # The addend C is tiled exactly like the output.
        cRows = dimVar(matC, baseC)
        cCols = dimVar(matC, baseC + 1)
        tilerModel.addConstraint(outRows == cRows)
        tilerModel.addConstraint(outCols == cCols)

        return tilerModel

    @staticmethod
    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Forbid tiling of the reduction dim and hint towards maximal M and O tiles."""
        from Deeploy.TilingExtension.TilerModel import PerformanceHint

        matA = ctxt.lookup(name = parseDict['A'])
        matB = ctxt.lookup(name = parseDict['B'])

        baseA = len(matA.shape) - 2
        baseB = len(matB.shape) - 2

        aRows = tilerModel.getTensorDimVar(tensorName = matA.name, dimIdx = baseA + parseDict['transA'])
        aCols = tilerModel.getTensorDimVar(tensorName = matA.name, dimIdx = baseA + 1 - parseDict['transA'])
        bRows = tilerModel.getTensorDimVar(tensorName = matB.name, dimIdx = baseB + parseDict['transB'])
        bCols = tilerModel.getTensorDimVar(tensorName = matB.name, dimIdx = baseB + 1 - parseDict['transB'])

        # VIC: We don't want to deal with intermediate results between kernel calls
        reductionSize = parseDict['N']
        tilerModel.addConstraint(aCols == reductionSize)
        tilerModel.addConstraint(bRows == reductionSize)

        tilerModel.addConstraint(aRows == aRows.Max(), strategy = PerformanceHint(1))
        tilerModel.addConstraint(bCols == bCols.Max(), strategy = PerformanceHint(1))

        return tilerModel

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        """Rebuild the A/B/C input tiles for every output tile and emit schedule + replacements.

        Output cubes of rank 2, 3 or 4 are accepted; missing leading axes are
        filled with offset 0 / size 1. The per-tile variables ``M``, ``O``,
        ``batch`` and the constant ``N`` are recorded for the template.
        """
        outputCubes = [absCube.rectangle for absCube in absoluteOutputCubes]

        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation,
                                                                  ['A', 'B', 'C', 'data_out'])

        transA = operatorRepresentation['transA']
        transB = operatorRepresentation['transB']

        # The reduction size is read from A: last axis when A is not transposed.
        shapeA = ctxt.lookup(operatorRepresentation['A']).shape
        NSize = shapeA[-1] if transA == 0 else shapeA[-2]
        NOffset = 0

        tilesA = []
        tilesB = []
        tilesC = []

        replacements = {"M": [], "O": [], "batch": []}

        # Every output is constructed by a pair of inputs. Reconstruct this pair.
        for outCube in outputCubes:
            BSize, BOffset = 1, 0
            BatchSize, BatchOffset = 1, 0

            rank = len(outCube.offset)
            if rank == 2:
                MOffset, OOffset = outCube.offset
                MSize, OSize = outCube.dims
            elif rank == 3:
                BatchOffset, MOffset, OOffset = outCube.offset
                BatchSize, MSize, OSize = outCube.dims
            else:
                BatchOffset, BOffset, MOffset, OOffset = outCube.offset
                BatchSize, BSize, MSize, OSize = outCube.dims

            replacements["M"].append(MSize)
            replacements["O"].append(OSize)
            replacements["batch"].append(BSize)

            leadOffset = (BatchOffset, BOffset)
            leadDims = (BatchSize, BSize)

            if transA == 0:
                tileA = HyperRectangle(leadOffset + (MOffset, NOffset), leadDims + (MSize, NSize))
            else:
                tileA = HyperRectangle(leadOffset + (NOffset, MOffset), leadDims + (NSize, MSize))

            if transB == 0:
                tileB = HyperRectangle(leadOffset + (NOffset, OOffset), leadDims + (NSize, OSize))
            else:
                tileB = HyperRectangle(leadOffset + (OOffset, NOffset), leadDims + (OSize, NSize))

            tilesA.append(tileA)
            tilesB.append(tileB)
            # C shares the geometry of the output tile.
            tilesC.append(HyperRectangle(outCube.offset, outCube.dims))

        replacements["N"] = [NSize] * len(outputCubes)

        replacementTypes = {
            "M": PointerClass(uint16_t),
            "N": PointerClass(uint16_t),
            "O": PointerClass(uint16_t),
            "batch": PointerClass(uint8_t)
        }

        inputLoadSchedule = [{"A": a, "B": b, "C": c} for a, b, c in zip(tilesA, tilesB, tilesC)]
        outputLoadSchedule = [{"data_out": outCube} for outCube in outputCubes]

        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)

        return VariableReplacementScheme(replacements, replacementTypes), schedule
from typing import Dict, List, Tuple

from Deeploy.AbstractDataTypes import PointerClass
from Deeploy.CommonExtensions.DataTypes import int8_t
from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
from Deeploy.TilingExtension.TileConstraint import TileConstraint
from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel
from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
    VariableReplacementScheme


class RedmuleMatmulTileConstraint(TileConstraint):
    """Tile constraints for the RedMulE MatMul node (inputs ``A``, ``B``; output ``data_out``).

    All index arithmetic uses the rank of ``A`` for all three tensors.
    NOTE(review): this presumably assumes A, B and the output share one rank -- confirm.
    """

    @staticmethod
    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Register A, B and the output with the tiler and tie their dimensions together.

        Leading (non-matrix) axes of the output are constrained to equal the
        same axes of both inputs; the last two axes follow the usual matrix
        product relation, with ``transA`` / ``transB`` selecting the row/column
        axis of each input.
        """
        # Get to-be-tiled tensor's buffers
        bufferA = ctxt.lookup(name = parseDict['A'])
        bufferB = ctxt.lookup(name = parseDict['B'])
        outputBuffer = ctxt.lookup(name = parseDict['data_out'])

        # Add I/O dimensions to the model as variables
        for _buffer in [bufferA, bufferB, outputBuffer]:
            tilerModel.addTensorDimToModel(ctxt, _buffer.name)

        tensorsShapeLen = len(bufferA.shape)

        AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transA'])
        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transA'])
        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transB'])
        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transB'])
        outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2))
        outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name,
                                                        dimIdx = (tensorsShapeLen - 1))

        # Map output dims to inputs dims
        for idx in range(tensorsShapeLen - 2):
            outputDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx)
            tilerModel.addConstraint(
                outputDimVar == tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = idx))
            tilerModel.addConstraint(
                outputDimVar == tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = idx))

        tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar)
        tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar)

        # Add GEMM Geometrical constraints
        tilerModel.addConstraint(ASecondDimVar == BFirstDimVar)

        return tilerModel

    @staticmethod
    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
        """Add the tiling policy: untiled reduction dim plus M/O tile-size preferences.

        * The reduction dim (A's second / B's first matrix dim) is pinned to
          ``parseDict['N']``.
        * If the full M extent is at least 16, a divisible-by-16 tile-size
          constraint is requested for M; otherwise M is hinted to its maximum.
        * If the full O extent is at least 12, a divisible-by-12 tile-size
          constraint is requested for O; otherwise O is hinted to its maximum.
        """
        bufferA = ctxt.lookup(name = parseDict['A'])
        bufferB = ctxt.lookup(name = parseDict['B'])

        tensorsShapeLen = len(bufferA.shape)

        # Axis indices of the tiled matrix dims; reused below to read the full extents.
        mDimIdx = (tensorsShapeLen - 2) + parseDict['transA']
        oDimIdx = (tensorsShapeLen - 1) - parseDict['transB']

        AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = mDimIdx)
        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transA'])
        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transB'])
        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = oDimIdx)

        # VIC: We don't want to deal with intermediate results between kernel calls
        tilerModel.addConstraint(ASecondDimVar == parseDict['N'])
        tilerModel.addConstraint(BFirstDimVar == parseDict['N'])

        # Hardware-specific constraints for 4x12 accelerator
        tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1))

        M_full_size = ctxt.lookup(bufferA.name).shape[mDimIdx]
        if M_full_size >= 16:
            tilerModel.addTileSizeDivisibleConstraint(parseDict,
                                                      "M",
                                                      AFirstDimVar,
                                                      16,
                                                      strategy = PerformanceHint(priority = 1))
        else:
            tilerModel.addConstraint(AFirstDimVar == AFirstDimVar.Max(), strategy = PerformanceHint(1))

        # The threshold must look at the axis BSecondDimVar actually tiles (O).
        # Reading B's first matrix axis here would test the reduction dim
        # instead and could request 12-divisible O tiles for an O extent < 12.
        O_full_size = ctxt.lookup(bufferB.name).shape[oDimIdx]
        if O_full_size >= 12:
            tilerModel.addTileSizeDivisibleConstraint(parseDict,
                                                      "O",
                                                      BSecondDimVar,
                                                      12,
                                                      strategy = PerformanceHint(priority = 1))
        else:
            tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1))

        return tilerModel

    @classmethod
    def serializeTilingSolution(
            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
            targetMemLevel: str, ctxt: NetworkContext,
            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
        """Rebuild the A/B input tiles for every output tile and emit schedule + replacements.

        Output cubes of rank 2, 3 or 4 are accepted; missing leading axes are
        filled with offset 0 / size 1. The per-tile variables ``M``, ``O``,
        ``batch`` and the constant ``N`` are recorded for the template.
        """
        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]

        addrNames = ['A', 'B', 'data_out']
        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
                                                                  operatorRepresentation, addrNames)

        varA = operatorRepresentation['A']

        # NOTE(review): N is always taken from A's last axis and the cubes below
        # are built in non-transposed layout, i.e. transA/transB are not
        # honoured here although the constraint methods read them -- confirm
        # that this node is only ever instantiated with transA == transB == 0.
        NSize = ctxt.lookup(varA).shape[-1]
        NOffset = 0

        inputACubes = []
        inputBCubes = []

        replacements = {"M": [], "O": [], "batch": []}

        # Every output is constructed by a pair of inputs. Reconstruct this pair.
        for cube in outputCubes:

            BSize = 1
            BOffset = 0
            BatchSize = 1
            BatchOffset = 0

            if len(cube.offset) == 2:
                (MOffset, OOffset) = cube.offset
                (MSize, OSize) = cube.dims
            elif len(cube.offset) == 3:
                (BatchOffset, MOffset, OOffset) = cube.offset
                (BatchSize, MSize, OSize) = cube.dims
            else:
                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
                (BatchSize, BSize, MSize, OSize) = cube.dims

            replacements["M"].append(MSize)
            replacements["O"].append(OSize)
            replacements["batch"].append(BSize)

            inputACubes.append(
                HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize)))
            inputBCubes.append(
                HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize)))

        replacements["N"] = [NSize] * len(outputCubes)

        # NOTE(review): the sibling GEMM constraint uses uint16_t for M/N/O and
        # uint8_t for batch; int8_t here looks too narrow for tile extents
        # above 127 -- confirm against the kernel template before widening.
        replacementTypes = {
            "M": PointerClass(int8_t),
            "N": PointerClass(int8_t),
            "O": PointerClass(int8_t),
            "batch": PointerClass(int8_t)
        }

        inputLoadSchedule = [{"A": a, "B": b} for a, b in zip(inputACubes, inputBCubes)]
        outputLoadSchedule = [{"data_out": out} for out in outputCubes]

        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)

        return VariableReplacementScheme(replacements, replacementTypes), schedule
+# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Redmule/Tiler.py b/Deeploy/Targets/Redmule/Tiler.py new file mode 100644 index 00000000..5264c089 --- /dev/null +++ b/Deeploy/Targets/Redmule/Tiler.py @@ -0,0 +1,49 @@ +# ---------------------------------------------------------------------- +# +# File: Tiler.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from Deeploy.Targets.PULPOpen.TileConstraints.ConvGradConstraint import PWConvGradWTileConstraint, \
    PWConvGradXTileConstraint
from Deeploy.Targets.Redmule.Bindings import RedmuleConv2DBindings, RedmuleGEMMBindings, RedmuleMatmulBindings, \
    RedmulePWConvGradW2DBindings, RedmulePWConvGradX2DBindings
from Deeploy.Targets.Redmule.TileConstraints.ConvTileConstraint import RedmuleConv2DTileConstraint
from Deeploy.Targets.Redmule.TileConstraints.GEMMTileConstraint import RedmuleGEMMTileConstraint
from Deeploy.Targets.Redmule.TileConstraints.MatmulTileConstraint import RedmuleMatmulTileConstraint
from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings

# Each object below pairs a set of RedMulE node bindings with the tile
# constraint instance used to tile that operator.

# Forward operators: RedMulE-specific bindings with RedMulE-specific constraints.
RedmuleMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleMatmulBindings,
                                                           tileConstraint = RedmuleMatmulTileConstraint())
RedmuleConvTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleConv2DBindings,
                                                         tileConstraint = RedmuleConv2DTileConstraint())
RedmuleGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleGEMMBindings,
                                                         tileConstraint = RedmuleGEMMTileConstraint())

# Reuse PULP's PWConvGradW / PWConvGradX tile constraints unchanged -- the
# tile-shape search depends only on the op semantics (1x1 conv backward),
# not on which engine ends up running the kernel. Only the binding body
# (= template + kernel) differs.
RedmulePWConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmulePWConvGradW2DBindings,
                                                                  tileConstraint = PWConvGradWTileConstraint())
RedmulePWConvGradX2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmulePWConvGradX2DBindings,
                                                                  tileConstraint = PWConvGradXTileConstraint())
import numpy as np
import onnx_graphsurgeon as gs

from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match
from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic
from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
    _appendTranspose


def _redmule_weight_layout_fun(graph: gs.Graph, match: Match, name: str):
    """Permute constant Conv weights with axes (1, 2, 3, 0), i.e. [cout, h, w, cin] -> [h, w, cin, cout].

    Weights that are not ``gs.Constant`` are left untouched.
    """
    convNode = list(match.nodes_map.values())[0]

    weights = convNode.inputs[1]
    if isinstance(weights, gs.Constant):
        weights.values = np.transpose(weights.values, (1, 2, 3, 0))

    return graph


@contextagnostic
class RedMuleAdjustWeightMemoryLayoutPass(ReplaceSequentialPatternPass):
    """Pass that rewrites Conv weights from [cout, h, w, cin] to [h, w, cin, cout] for the RedMulE accelerator."""

    def __init__(self, redmuleEngineName: str):
        # Pattern: a single Conv node.
        pattern = gs.Graph()
        patternInput = gs.Variable(name = 'input_1')
        patternOutput = pattern.layer(inputs = [patternInput], outputs = ['convOut'], op = 'Conv', name = 'conv')
        pattern.outputs.append(patternOutput)
        pattern.inputs.append(patternInput)

        super().__init__(pattern, _redmule_weight_layout_fun, "_REDMULE_ADJUST_WEIGHT_MEMORY_LAYOUT_PASS")


def _swapped_last_two_perm(rank: int):
    """Return the identity permutation of ``rank`` axes with the last two axes exchanged."""
    perm = list(range(rank))
    perm[-1], perm[-2] = perm[-2], perm[-1]
    return perm


def _transpose_constant_values(values):
    """Transpose the last two axes of a constant; arrays of rank <= 2 go through plain ``np.transpose``."""
    if len(values.shape) > 2:
        return np.transpose(values, _swapped_last_two_perm(len(values.shape)))
    return np.transpose(values)


def _redmule_gemm_transpose_fun(graph: gs.Graph, match: Match, name: str):
    """
    Resolve the transA / transB attributes of a matched Gemm node.

    Missing ``transA``, ``transB``, ``alpha`` and ``beta`` attributes are
    filled with 0, 0, 1.0 and 1.0. For every input whose trans flag is set,
    a constant is transposed in place, while a non-constant input gets a
    Transpose node inserted in front of the Gemm. In both cases only the
    last two axes are exchanged and the flag is reset to 0.
    """
    gemmNode = list(match.nodes_map.values())[0]

    for attrName, fallback in (('transA', 0), ('transB', 0), ('alpha', 1.0), ('beta', 1.0)):
        if attrName not in gemmNode.attrs:
            gemmNode.attrs[attrName] = fallback

    # Grab both operands before any Transpose node is spliced in.
    operandA = gemmNode.inputs[0]
    operandB = gemmNode.inputs[1]

    if gemmNode.attrs['transA'] != 0:
        if isinstance(operandA, gs.Constant):
            print(f"Physical transpose for constant A: {operandA.name}")
            operandA.values = _transpose_constant_values(operandA.values)
            gemmNode.attrs['transA'] = 0
        else:
            transposeNode = _appendTranspose(operandA, gemmNode, _swapped_last_two_perm(len(operandA.shape)))
            gemmNode.attrs['transA'] = 0
            graph.nodes.append(transposeNode)

    if gemmNode.attrs['transB'] != 0:
        if isinstance(operandB, gs.Constant):
            operandB.values = _transpose_constant_values(operandB.values)
            gemmNode.attrs['transB'] = 0
        else:
            print(f"Adding transpose node for variable B: {operandB.name}")
            transposeNode = _appendTranspose(operandB, gemmNode, _swapped_last_two_perm(len(operandB.shape)))
            gemmNode.attrs['transB'] = 0
            graph.nodes.append(transposeNode)

    return graph


@contextagnostic
class RedMuleGEMMTransposePass(ReplaceSequentialPatternPass):
    """Pass that resolves the Gemm transA / transB attributes for the RedMulE accelerator."""

    def __init__(self, redmuleEngineName: str):
        # Pattern: a single Gemm node with two inputs.
        gemmPattern = gs.Graph()

        lhs = gs.Variable(name = "input_a")
        rhs = gs.Variable(name = "input_b")

        gemmOutput = gemmPattern.layer(op = "Gemm", name = "gemm_node", inputs = [lhs, rhs], outputs = ["gemm_output"])

        gemmPattern.inputs = [lhs, rhs]
        gemmPattern.outputs = [gemmOutput]

        super().__init__(pattern = gemmPattern,
                         replacement_fn = _redmule_gemm_transpose_fun,
                         name = "_REDMULE_GEMM_TRANSPOSE_PASS")
import * diff --git a/Deeploy/Targets/Redmule/__init__.py b/Deeploy/Targets/Redmule/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/Deeploy/Targets/SoftHier/__init__.py b/Deeploy/Targets/SoftHier/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index 3d6480d5..9dd0bb65 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -57,7 +57,7 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) target_compile_options(network PRIVATE -Wno-pointer-sign) endif() - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) add_subdirectory(Platforms/Siracusa) elseif(platform STREQUAL PULPOpen) add_subdirectory(Platforms/PULPOpen) diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py index f29891bf..4b05bd59 100644 --- a/DeeployTest/conftest.py +++ b/DeeployTest/conftest.py @@ -66,6 +66,8 @@ def pytest_configure(config: pytest.Config) -> None: config.addinivalue_line("markers", "siracusa_tiled: mark test as a Siracusa platform test (tiled)") config.addinivalue_line("markers", "siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)") + config.addinivalue_line("markers", + "siracusa_redmule_tiled: mark test as a Siracusa + RedMulE platform test (tiled)") config.addinivalue_line("markers", "gap9: mark test as a GAP9 platform test") config.addinivalue_line("markers", "gap9_tiled: mark test as a GAP9 platform test (tiled)") config.addinivalue_line("markers", "kernels: mark test as a kernel test (individual operators)") diff --git a/DeeployTest/testRunner_tiled_siracusa_w_redmule.py b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py new file mode 100644 index 00000000..9ebd9c63 --- /dev/null +++ b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py @@ -0,0 +1,49 @@ +# 
---------------------------------------------------------------------- +# +# File: testRunner_tiled_siracusa_w_redmule.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from testUtils.testRunner import TestRunner, TestRunnerArgumentParser + +if __name__ == "__main__": + + parser = TestRunnerArgumentParser( + tiling_arguments = True, + description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & Redmule).") + + parser.add_argument('--cores', + metavar = '', + dest = 'cores', + type = int, + default = 1, + help = 'Set number of cluster cores') + args = parser.parse_args() + + testRunner = TestRunner(platform = "Siracusa_w_redmule", + simulator = "gvsoc", + tiling = True, + argument_parser = parser) + + testRunner.cmake_args += f" -D NUM_CORES={args.cores}" + + testRunner.run() diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py index 9d526906..32c06c95 100644 --- a/DeeployTest/testUtils/platformMapping.py +++ b/DeeployTest/testUtils/platformMapping.py @@ -25,13 +25,17 @@ NeurekaPlatform from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform +from 
Deeploy.Targets.Redmule.Deployer import RedmuleDeployer +from Deeploy.Targets.Redmule.Platform import RedmuleOptimizer, RedmulePlatform from Deeploy.Targets.Snitch.Deployer import SnitchDeployer from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = [ + "Siracusa", "Siracusa_w_neureka", "Siracusa_w_redmule", "PULPOpen", "Snitch", "Chimera", "GAP9" +] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -67,6 +71,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Siracusa_w_neureka": Platform = NeurekaPlatform() + elif platformName == "Siracusa_w_redmule": + Platform = RedmulePlatform() + elif platformName == "Snitch": Platform = SnitchPlatform() @@ -84,7 +91,7 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: def setupMemoryPlatform(platform: DeploymentPlatform, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel) -> Union[MemoryPlatform, MemoryPlatformWrapper]: - if isinstance(platform, PULPPlatform): + if isinstance(platform, (PULPPlatform, RedmulePlatform)): return MemoryPULPPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel) elif isinstance(platform, NeurekaPlatform): weightMemoryLevel = memoryHierarchy.memoryLevels["WeightMemory_SRAM"] \ @@ -207,6 +214,24 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + elif isinstance(platform, RedmulePlatform): + + if loweringOptimizer is None: + loweringOptimizer = RedmuleOptimizer + + if default_channels_first is None: + default_channels_first = 
False + + deployer = RedmuleDeployer(graph, + platform, + inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir, + inputOffsets = inputOffsets) + elif isinstance(platform, (GAP9Platform, MemoryGAP9Platform, MemoryGAP9PlatformWrapper)): if loweringOptimizer is None: diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 7eee2085..83cdb131 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -35,6 +35,12 @@ from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS as NEUREKA_L3_DOUBLEBUFFER_MODELS from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS_WMEM as NEUREKA_L3_DOUBLEBUFFER_MODELS_WMEM from test_siracusa_neureka_tiled_config import L3_SINGLEBUFFER_MODELS as NEUREKA_L3_SINGLEBUFFER_MODELS +from test_siracusa_redmule_tiled_config import DEFAULT_CORES as REDMULE_DEFAULT_CORES +from test_siracusa_redmule_tiled_config import L2_DOUBLEBUFFER_KERNELS as REDMULE_L2_DOUBLEBUFFER_KERNELS +from test_siracusa_redmule_tiled_config import L2_SINGLEBUFFER_KERNELS as REDMULE_L2_SINGLEBUFFER_KERNELS +from test_siracusa_redmule_tiled_config import \ + L3_SINGLEBUFFER_TRAINING_MODELS as REDMULE_L3_SINGLEBUFFER_TRAINING_MODELS +from test_siracusa_redmule_tiled_config import TRAINING_MODEL_OVERRIDES as REDMULE_TRAINING_MODEL_OVERRIDES from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS, L2_SINGLEBUFFER_KERNELS, \ L2_SINGLEBUFFER_MODELS from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS @@ -1100,3 +1106,95 @@ def test_gap9_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolch double_buffer = True, ) run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_redmule_tiled +@pytest.mark.kernels +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + 
"test_params", + generate_test_params(REDMULE_L2_SINGLEBUFFER_KERNELS, "L2-singlebuffer"), + ids = param_id, +) +def test_siracusa_redmule_tiled_kernels_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_redmule", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = REDMULE_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_redmule_tiled +@pytest.mark.kernels +@pytest.mark.doublebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(REDMULE_L2_DOUBLEBUFFER_KERNELS, "L2-doublebuffer"), + ids = param_id, +) +def test_siracusa_redmule_tiled_kernels_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_redmule", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = REDMULE_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_redmule_tiled +@pytest.mark.training +@pytest.mark.singlebuffer +@pytest.mark.l3 +@pytest.mark.parametrize( + "test_params", + generate_test_params(REDMULE_L3_SINGLEBUFFER_TRAINING_MODELS, "L3-singlebuffer-training"), + ids = param_id, +) +def test_siracusa_redmule_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) 
-> None: + test_name, l1, _config_name = test_params + overrides = REDMULE_TRAINING_MODEL_OVERRIDES.get(test_name, {}) + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_redmule", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = REDMULE_DEFAULT_CORES, + l1 = l1, + l2 = 2000000, + default_mem_level = "L3", + double_buffer = False, + training = True, + training_num_data_inputs = overrides.get("num_data_inputs"), + training_tolerance = overrides.get("tolerance"), + ) + run_and_assert_test(test_name, config, skipgen, skipsim) diff --git a/DeeployTest/test_siracusa_redmule_tiled_config.py b/DeeployTest/test_siracusa_redmule_tiled_config.py new file mode 100644 index 00000000..2001513c --- /dev/null +++ b/DeeployTest/test_siracusa_redmule_tiled_config.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for Siracusa platform with RedMulE accelerator (tiled).""" + +# Siracusa + RedMulE platform with tiling support +# Default configuration: 8 cores, gvsoc simulator + +DEFAULT_CORES = 8 + +# L2 single-buffer kernel tests +# Format: dict of {test_name: [L1_sizes]} +L2_SINGLEBUFFER_KERNELS = { + "Kernels/FP32/GEMM/Regular": [8000], + "Kernels/FP32/GEMM/TransB": [8000], + # Pointwise (1x1) ConvGrad fixtures from the MobileNet / ResNet8 backward + # paths. Both bind to RedMulE via the PWConvGrad{W,X}2DRedmuleMapper + # inserted into PULPCluster's ConvGrad{W,X}Layer in + # RedmulePlatform.__init__. L1=8000 mirrors the GEMM kernel budget. + "Kernels/FP32/ConvGradW_PW": [8000], + "Kernels/FP32/ConvGradX_PW_block_11": [8000], +} + +# L2 double-buffer kernel tests +L2_DOUBLEBUFFER_KERNELS = { + "Kernels/FP32/GEMM/Regular": [8000], +} + +# L3 single-buffer training models. 
Pared down to just CCT for now: the +# new PWConvGrad{W,X} RedMulE kernels are primarily validated via the +# kernel-test matrix above (Kernels/FP32/ConvGradW_PW + +# Kernels/FP32/ConvGradX_PW_block_11) which uses deterministic ORT-computed +# references. A fully-empty dict here would make +# `@pytest.mark.parametrize` error out at collection time with +# "error raised while trying to determine id of parameter 'test_params' at +# position 0", blocking the kernel jobs that share the same test module -- +# so we keep CCT as a minimum (smallest of the three). Re-add ResNet8 and +# MobileNetV1 once the new W kernel's tiler interaction is confirmed. +L3_SINGLEBUFFER_TRAINING_MODELS = { + "Models/Training/CCT/cct_train": [128000], +} + +# Match the per-model overrides used in test_siracusa_tiled_config so the +# RedMulE training run inherits the same num_data_inputs and tolerance +# (CCT step-0 forward drift ~1.5e-3, see comment in that file). +TRAINING_MODEL_OVERRIDES = { + "Models/Training/CCT/cct_train": { + "num_data_inputs": 1, + "tolerance": 5e-3, + }, +} diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index bafa6635..a4ad2935 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -20,6 +20,10 @@ "Kernels/FP32/Conv/Regular_2D_NoBias": [1600], "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [6600], "Kernels/FP32/GEMM/Regular": [8000], + # PW ConvGrad baselines so the RedMulE-side speedup table has matching + # PULP numbers to diff against in the CI summary. 
+ "Kernels/FP32/ConvGradW_PW": [8000], + "Kernels/FP32/ConvGradX_PW_block_11": [8000], "Kernels/FP32/MatMul": [2000], "Kernels/FP32/MaxPool/Regular_2D": [2000], "Kernels/FP32/Mul": [2000], diff --git a/Makefile b/Makefile index f007f105..423c3b8d 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ PULP_SDK_COMMIT_HASH ?= 7f4f22516157a1b7c55bcbbc72ca81326180b3b4 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2 SOFTHIER_COMMIT_HASH ?= 0 # bowwang: to be updated -GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 +GVSOC_COMMIT_HASH ?= 35d00d15d7249daaac0de61bd8485fba128e5959 MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d CHIMERA_SDK_COMMIT_HASH ?= b2392f6efcff75c03f4c65eaf3e12104442b22ea XTL_VERSION ?= 0.7.5 @@ -465,7 +465,7 @@ snitch_runtime: ${SNITCH_INSTALL_DIR} ${TOOLCHAIN_DIR}/gvsoc: cd ${TOOLCHAIN_DIR} && \ - git clone https://github.com/gvsoc/gvsoc.git && \ + git clone https://github.com/runwangdl/gvsoc.git && \ cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \ git submodule update --init --recursive && \ pip install -r core/requirements.txt && pip install -r gapy/requirements.txt diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index ce39fea7..d8db78be 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -10,7 +10,7 @@ if(NOT DEFINED ENV{PULP_SDK_HOME}) message(FATAL_ERROR "Environment variable PULP_SDK_HOME not set.") endif() -if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka") +if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka" OR platform STREQUAL "Siracusa_w_redmule") include(cmake/pulp-sdk-siracusa.cmake) elseif(platform STREQUAL "PULPOpen") include(cmake/pulp-sdk-pulp-open.cmake) diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h index 
7eff2b1f..43d33593 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h @@ -26,6 +26,19 @@ void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC( uint32_t pad_left, uint32_t pad_right, float32_t *__restrict__ pContextBuffer); +// RedMulE-accelerated FP32 Conv2d. Expects weight already permuted from the +// ONNX [F, P, Q, C] layout to [P, Q, C, F] (a flat [P*Q*C, F] matrix); +// RedMuleAdjustWeightMemoryLayoutPass handles that. pIm2ColBuf must hold +// IM2COL_CHUNK_ROWS (16) * (C*P*Q) FP32 elements; its size is reserved by +// RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize. +void Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule( + const float32_t *__restrict__ pIn, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pWeight, uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, const float32_t *__restrict__ pBias, const bool has_bias, + float32_t *__restrict__ pOut, uint32_t F, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pIm2ColBuf); + void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C, const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P, @@ -93,6 +106,27 @@ void PULP_PWConvGradW2d_fp32_fp32_fp32_CHW( uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in, uint32_t W_in, uint32_t C_in, float *__restrict__ pGradWeight); +// RedMulE-accelerated pointwise (1x1) Conv backward weight gradient. +// Same arg order as PULP_PWConvGradW2d_fp32_fp32_fp32_CHW plus a +// pTransposeBuffer (reserved by +// RedmulePWConvGradW2DTemplate.computeTransientBuffersSize) of 16 * C_in +// floats of X^T, plus 16 * C_out floats of dY when H_out * W_out > 16. 
+void PWConvGradW2d_fp32_fp32_fp32_CHW_Redmule( + const float32_t *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float32_t *__restrict__ pInput, uint32_t H_in, + uint32_t W_in, uint32_t C_in, float32_t *__restrict__ pGradWeight, + float32_t *__restrict__ pTransposeBuffer); + +// RedMulE-accelerated pointwise (1x1) Conv backward input gradient. +// Mirrors PULP_PWConvGradX2d_fp32_fp32_fp32_CHW signature; the C_in*C_out +// transpose buffer is reused for W^T before firing one RedMulE GEMM. +void PWConvGradX2d_fp32_fp32_fp32_CHW_Redmule( + const float32_t *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float32_t *__restrict__ pWeight, uint32_t C_in, + float32_t *__restrict__ pGradIn, uint32_t H_in, uint32_t W_in, + float32_t *__restrict__ pTransposeBuffer, + uint32_t transposeBufferSize); + void PULP_PWConvGradX2d_fp32_fp32_fp32_CHW( const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, uint32_t C_out, const float *__restrict__ pWeight, uint32_t C_in, diff --git a/TargetLibraries/PULPOpen/src/Conv2d_Im2Col_fp32_Redmule.c b/TargetLibraries/PULPOpen/src/Conv2d_Im2Col_fp32_Redmule.c new file mode 100644 index 00000000..b5b91235 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/Conv2d_Im2Col_fp32_Redmule.c @@ -0,0 +1,142 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +// RedMulE matmul kernels live in Matmul_fp32_Redmule.c and have no header +// of their own; forward-declare the two we need rather than adding a +// cross-file include. 
+extern void MatMul_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t O); +extern void Gemm_fp32_fp32_fp32_fp32_Redmule( + const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, + const float32_t *__restrict__ pBias, float32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t O); + +// Chunk size for the streaming im2col + RedMulE pipeline. Chosen to be 16 +// because RedMulE's FP32 mode wants M divisible by 16 for full 4x12-array +// utilisation, and 16 rows × K columns fits comfortably in L1 for any K we +// reasonably expect from a Conv layer (e.g. C·P·Q = 576 for a 3x3 Conv with +// 64 input channels -> 16*576*4 = 36 KiB). The transient buffer hoisted by +// RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize is sized to +// exactly this many rows. +#define IM2COL_CHUNK_ROWS 16 + +// Layout assumptions: +// pIn : input in HWC, shape [H, W, C] +// pWeight : weight after RedMuleAdjustWeightMemoryLayoutPass, which +// transposes the ONNX [F, P, Q, C] weight into [P, Q, C, F]. +// In a flat im2col-style view that is a [P*Q*C, F] matrix, +// i.e. exactly the right operand of (im2col @ W). +// pOut : output in HWC, shape [H_out, W_out, F] +// pBias : optional bias of shape [F], broadcast across all output +// positions when has_bias is true. +// pIm2ColBuf: transient L1 scratch of size IM2COL_CHUNK_ROWS * (C*P*Q) +// floats, hoisted by ConvTemplate.computeTransientBuffersSize. +// +// Compute (streaming): +// For each chunk of IM2COL_CHUNK_ROWS output positions: +// 1. All cluster cores cooperatively build the chunk's im2col rows +// into pIm2ColBuf (zero-pad when h_in/w_in fall outside the input). +// 2. Cluster barrier. +// 3. Master core triggers one RedMulE GEMM: +// [chunk_rows, K] @ [K, F] -> [chunk_rows, F] +// written directly into the corresponding stripe of pOut. 
When +// has_bias is set, the [F] bias is broadcast into that stripe +// first and then Gemm is called with y_addr = z_addr = stripe +// (same y=z aliasing pattern Matmul_fp32_Redmule already uses). +// 4. Cluster barrier. +// +// Streaming was chosen over whole-image im2col because larger Conv layers +// (e.g. ResNet8 middle layers with H_out*W_out ≥ 1024) would otherwise +// blow the L1 budget: a 1024-row im2col with K=144 is 576 KiB, far above +// the 128 KiB L1 tile budget. 16 rows per chunk costs a few extra RedMulE +// triggers (~200 cycles each) but lets the tiler keep working at any +// reasonable Conv size. +void Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule( + const float32_t *__restrict__ pIn, uint32_t H, uint32_t W, uint32_t C, + const float32_t *__restrict__ pWeight, uint32_t P, uint32_t Q, uint32_t SP, + uint32_t SQ, const float32_t *__restrict__ pBias, const bool has_bias, + float32_t *__restrict__ pOut, uint32_t F, uint32_t pad_top, + uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right, + float32_t *__restrict__ pIm2ColBuf) { + + const int8_t core_id = pi_core_id(); + + const uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1; + const uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1; + const uint32_t N_out = H_out * W_out; + const uint32_t K = C * P * Q; + + for (uint32_t row_start = 0; row_start < N_out; + row_start += IM2COL_CHUNK_ROWS) { + const uint32_t this_chunk = + ((N_out - row_start) < IM2COL_CHUNK_ROWS) ? (N_out - row_start) + : IM2COL_CHUNK_ROWS; + + // ---- 1. Parallel im2col over this chunk's rows ---------------------- + // Each core fills a contiguous slice of the chunk; with CHUNK_ROWS=16 + // and NUM_CORES=8, every core handles exactly 2 rows when the chunk is + // full. A short tail chunk (e.g. last 5 rows) leaves the higher-numbered + // cores idle. + const uint32_t local_chunk = + (this_chunk + NUM_CORES - 1) / NUM_CORES; + const uint32_t local_start = + ((uint32_t)core_id * local_chunk < this_chunk) + ? 
((uint32_t)core_id * local_chunk) + : this_chunk; + const uint32_t local_end = ((local_start + local_chunk) < this_chunk) + ? (local_start + local_chunk) + : this_chunk; + + for (uint32_t r = local_start; r < local_end; ++r) { + const uint32_t pos = row_start + r; + const uint32_t h_out = pos / W_out; + const uint32_t w_out = pos % W_out; + float32_t *row = pIm2ColBuf + r * K; + uint32_t k = 0; + for (uint32_t p = 0; p < P; ++p) { + const int32_t h_in = (int32_t)(h_out * SP + p) - (int32_t)pad_top; + const bool h_in_range = (h_in >= 0) && (h_in < (int32_t)H); + for (uint32_t q = 0; q < Q; ++q) { + const int32_t w_in = (int32_t)(w_out * SQ + q) - (int32_t)pad_left; + if (h_in_range && (w_in >= 0) && (w_in < (int32_t)W)) { + const uint32_t in_base = ((uint32_t)h_in * W + (uint32_t)w_in) * C; + for (uint32_t c = 0; c < C; ++c) { + row[k++] = pIn[in_base + c]; + } + } else { + for (uint32_t c = 0; c < C; ++c) { + row[k++] = 0.0f; + } + } + } + } + } + + pi_cl_team_barrier(0); + + // ---- 2. 
RedMulE GEMM for this chunk's output stripe ----------------- + if (core_id == 0) { + float32_t *out_stripe = pOut + row_start * F; + if (has_bias) { + for (uint32_t i = 0; i < this_chunk; ++i) { + for (uint32_t f = 0; f < F; ++f) { + out_stripe[i * F + f] = pBias[f]; + } + } + Gemm_fp32_fp32_fp32_fp32_Redmule(pIm2ColBuf, pWeight, out_stripe, + out_stripe, this_chunk, K, F); + } else { + MatMul_fp32_fp32_fp32_Redmule(pIm2ColBuf, pWeight, out_stripe, + this_chunk, K, F); + } + } + + pi_cl_team_barrier(0); + } +} diff --git a/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c new file mode 100644 index 00000000..ad33b66b --- /dev/null +++ b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c @@ -0,0 +1,160 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployBasicMath.h" + +#define REDMULE_BASE_ADDR 0x10201C00 + +#define REG_MNK_M 0x00 +#define REG_MNK_N 0x04 +#define REG_MNK_K 0x08 +#define REG_X_ADDR 0x0C +#define REG_Y_ADDR 0x10 +#define REG_Z_ADDR 0x14 +#define REG_W_ADDR 0x18 +#define REG_COMPUTE_MODE 0x1C +#define REG_TRIGGER 0x20 +#define REG_WAIT 0x28 + +void MatMul_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t O) { + + uint32_t total_elements = M * O; + for (uint32_t i = 0; i < total_elements; i++) { + pDstY[i] = 0.0f; + } + + volatile uint16_t *mnk_m = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M); + volatile uint16_t *mnk_n = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N); + volatile uint16_t *mnk_k = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K); + + *mnk_m = (uint16_t)M; + *mnk_n = (uint16_t)N; + *mnk_k = (uint16_t)O; + + volatile uint32_t *x_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR); + volatile uint32_t *y_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR 
+ REG_Y_ADDR); + volatile uint32_t *z_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR); + volatile uint32_t *w_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR); + + *x_addr = (uint32_t)((uintptr_t)pSrcA); + *y_addr = (uint32_t)((uintptr_t)pDstY); + *z_addr = (uint32_t)((uintptr_t)pDstY); + *w_addr = (uint32_t)((uintptr_t)pSrcB); + + volatile uint32_t *compute_mode = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE); + *compute_mode = 4; // FP32 mode + + volatile uint32_t *trigger = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER); + *trigger; + + volatile uint32_t *wait_reg = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT); + uint32_t result = *wait_reg; +} + +void MatMul_fp32_fp32_fp32_Redmule_Async(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t O) { + + uint32_t total_elements = M * O; + for (uint32_t i = 0; i < total_elements; i++) { + pDstY[i] = 0.0f; + } + + volatile uint16_t *mnk_m = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M); + volatile uint16_t *mnk_n = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N); + volatile uint16_t *mnk_k = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K); + + *mnk_m = (uint16_t)M; + *mnk_n = (uint16_t)N; + *mnk_k = (uint16_t)O; + + volatile uint32_t *x_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR); + volatile uint32_t *y_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR); + volatile uint32_t *z_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR); + volatile uint32_t *w_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR); + + *x_addr = (uint32_t)((uintptr_t)pSrcA); + *y_addr = (uint32_t)((uintptr_t)pDstY); + *z_addr = (uint32_t)((uintptr_t)pDstY); + *w_addr = (uint32_t)((uintptr_t)pSrcB); + + volatile uint32_t *compute_mode = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE); + *compute_mode = 4; // 
FP32 mode + + volatile uint32_t *trigger = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER); + *trigger; // Trigger without waiting +} + +uint32_t MatMul_fp32_fp32_fp32_Redmule_Wait() { + volatile uint32_t *wait_reg = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT); + return *wait_reg; +} + +void Gemm_fp32_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + const float32_t *__restrict__ pBias, + float32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t O) { + + volatile uint16_t *mnk_m = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M); + volatile uint16_t *mnk_n = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N); + volatile uint16_t *mnk_k = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K); + + *mnk_m = (uint16_t)M; + *mnk_n = (uint16_t)N; + *mnk_k = (uint16_t)O; + + volatile uint32_t *x_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR); + volatile uint32_t *y_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR); + volatile uint32_t *z_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR); + volatile uint32_t *w_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR); + + *x_addr = (uint32_t)((uintptr_t)pSrcA); + *y_addr = (uint32_t)((uintptr_t)pBias); + *z_addr = (uint32_t)((uintptr_t)pDstY); + *w_addr = (uint32_t)((uintptr_t)pSrcB); + + volatile uint32_t *compute_mode = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE); + *compute_mode = 4; // FP32 mode + + volatile uint32_t *trigger = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER); + *trigger; + + volatile uint32_t *wait_reg = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT); + uint32_t result = *wait_reg; +} diff --git a/TargetLibraries/PULPOpen/src/PWConvGrad_fp32_Redmule.c b/TargetLibraries/PULPOpen/src/PWConvGrad_fp32_Redmule.c new file mode 100644 index 00000000..e1945e38 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/PWConvGrad_fp32_Redmule.c @@ -0,0 
+1,227 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +extern void MatMul_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t O); +extern void Gemm_fp32_fp32_fp32_fp32_Redmule( + const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB, + const float32_t *__restrict__ pBias, float32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t O); + +// Chunk over P = H_out * W_out positions to keep the L1 transient buffer +// fixed-small regardless of the network's feature-map area. Each chunk +// runs one RedMulE call; chunk-to-chunk accumulation rides on Gemm's +// y_addr = bias = previous dW pattern (same trick the MatMul driver uses +// for its Y=Z=pDstY zero-init). +#define PWGW_CHUNK_P 16 + +// Pointwise (1x1) Conv backward weight gradient, RedMulE-accelerated. +// +// Forward (1x1, stride (SP, SQ)): +// Y[F, h_out, w_out] = sum_c X[c, h_out * SP, w_out * SQ] * W[F, c, 0, 0] +// Backward dW: +// dW[F, C] = sum_{n, h_out, w_out} dY[F, h_out, w_out] +// * X[C, h_out * SP, w_out * SQ] +// +// Mathematically dW = dY_reshape[F, P] @ X_sampled^T[P, C] with +// P = H_out * W_out. A full P-row transpose buffer doesn't scale -- early +// MobileNet blocks would need a 32 * 48 * 48 float (288 KiB) buffer and the +// pattern-memory solver runs out of L1 budget. Instead, sample+transpose +// PWGW_CHUNK_P rows at a time and accumulate into dW via Gemm: +// dW = dY_chunk[F, chunk_size] @ X_chunk^T[chunk_size, C] + dW_prev +// The scratch is bounded by PWGW_CHUNK_P * (C_in + C_out) floats regardless +// of P, at the cost of one extra RedMulE trigger per chunk (~200 cycles each). +// +// Stride is recovered as H_in / H_out, W_in / W_out to keep the pulp-trainlib +// signature. NOTE(review): assumes H_in == SP * H_out, W_in == SQ * W_out. 
+void PWConvGradW2d_fp32_fp32_fp32_CHW_Redmule( + const float32_t *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float32_t *__restrict__ pInput, uint32_t H_in, + uint32_t W_in, uint32_t C_in, float32_t *__restrict__ pGradWeight, + float32_t *__restrict__ pTransposeBuffer) { + + const int8_t core_id = pi_core_id(); + const uint32_t SP = (H_out > 0) ? (H_in / H_out) : 1; + const uint32_t SQ = (W_out > 0) ? (W_in / W_out) : 1; + const uint32_t P = H_out * W_out; + + // Initialise dW to zero so the first chunk's Gemm-with-bias starts from + // a clean slate. Done in parallel across cores. + const uint32_t dw_total = C_out * C_in; + const uint32_t dw_chunk = (dw_total + NUM_CORES - 1) / NUM_CORES; + const uint32_t dw_lo = MIN((uint32_t)core_id * dw_chunk, dw_total); + const uint32_t dw_hi = MIN(dw_lo + dw_chunk, dw_total); + for (uint32_t i = dw_lo; i < dw_hi; ++i) { + pGradWeight[i] = 0.0f; + } + pi_cl_team_barrier(0); + + for (uint32_t chunk_start = 0; chunk_start < P; chunk_start += PWGW_CHUNK_P) { + const uint32_t this_chunk = + ((P - chunk_start) < PWGW_CHUNK_P) ? (P - chunk_start) : PWGW_CHUNK_P; + + // ---- 1. Parallel sampled-transpose of this chunk's X positions ----- + // pTransposeBuffer[k_local * C_in + c] = X[c, h_in, w_in] + const uint32_t total = this_chunk * C_in; + const uint32_t chunk_w = (total + NUM_CORES - 1) / NUM_CORES; + const uint32_t lo = MIN((uint32_t)core_id * chunk_w, total); + const uint32_t hi = MIN(lo + chunk_w, total); + + for (uint32_t idx = lo; idx < hi; ++idx) { + const uint32_t k_local = idx / C_in; + const uint32_t c = idx % C_in; + const uint32_t k = chunk_start + k_local; + const uint32_t h_out = k / W_out; + const uint32_t w_out = k % W_out; + const uint32_t h_in = h_out * SP; + const uint32_t w_in = w_out * SQ; + pTransposeBuffer[idx] = pInput[c * (H_in * W_in) + h_in * W_in + w_in]; + } + + pi_cl_team_barrier(0); + + // ---- 2. 
RedMulE Gemm: dW = dY_chunk @ X_chunk^T + dW_prev --------- + // dY is laid out as [C_out, P] = [F, P] in CHW, so row f of the chunk + // [chunk_start : chunk_start + this_chunk] starts at + // pGradOut + f * P + chunk_start -- rows are P floats apart, NOT + // this_chunk floats apart. The Gemm driver only receives base + // addresses and (M, N, O), so its left operand has to be a dense + // [F, this_chunk] matrix. + // + // Two cases: + // * this_chunk == P (P <= PWGW_CHUNK_P, single chunk): dY itself is + // already a dense [F, P] matrix and is passed as-is. + // * otherwise: all cores gather the chunk's columns of dY into a + // dense [F, this_chunk] scratch (dY_view) placed right after the + // PWGW_CHUNK_P * C_in floats of X^T in pTransposeBuffer, then the + // master core fires the Gemm on that scratch. + // + // Either way pGradWeight is both bias and destination, so each chunk + // accumulates on top of the dW produced by the previous chunks. + if (this_chunk == P) { + if (core_id == 0) { + Gemm_fp32_fp32_fp32_fp32_Redmule(pGradOut, pTransposeBuffer, + pGradWeight, pGradWeight, C_out, + this_chunk, C_in); + } + } else { + // Multi-chunk path: gather a contiguous [F, this_chunk] view of dY + // into the tail of pTransposeBuffer. The template reserves enough + // headroom (see RedmulePWConvGradWTemplate.computeTransientBuffersSize). 
+ float32_t *dY_view = pTransposeBuffer + (PWGW_CHUNK_P * C_in); + const uint32_t dy_total = C_out * this_chunk; + const uint32_t dy_chunk = (dy_total + NUM_CORES - 1) / NUM_CORES; + const uint32_t dy_lo = MIN((uint32_t)core_id * dy_chunk, dy_total); + const uint32_t dy_hi = MIN(dy_lo + dy_chunk, dy_total); + for (uint32_t idx = dy_lo; idx < dy_hi; ++idx) { + const uint32_t f = idx / this_chunk; + const uint32_t k_local = idx % this_chunk; + const uint32_t k = chunk_start + k_local; + dY_view[idx] = pGradOut[f * P + k]; + } + pi_cl_team_barrier(0); + if (core_id == 0) { + Gemm_fp32_fp32_fp32_fp32_Redmule(dY_view, pTransposeBuffer, + pGradWeight, pGradWeight, C_out, + this_chunk, C_in); + } + } + + pi_cl_team_barrier(0); + } +} + +// Pointwise (1x1) Conv backward input gradient, RedMulE-accelerated. +// +// Same shape relations as the forward path; stride > 1 means dX has more +// spatial positions than dY and only the strided samples are non-zero. +// +// Pipeline: +// - Zero pGradIn. +// - W^T transpose: pTransposeBuffer[0:C_in*C_out] = W^T. +// - GEMM tmp[C_in, P] = W^T @ dY[C_out, P], P = H_out * W_out. +// For stride 1 we write tmp directly into pGradIn (dX layout matches). +// For stride > 1 we route the GEMM output to the tail of +// pTransposeBuffer and scatter it into pGradIn at strided positions. +// +// Unlike the W kernel, X's GEMM dimensions don't scale with P alone -- +// the K (inner) dim is C_out, which is bounded by the tile's +// channel-tile. So the existing all-in-one-GEMM path remains feasible +// and we keep it; only the transient buffer changed shape (size cap +// reflected in RedmulePWConvGradXTemplate).
+void PWConvGradX2d_fp32_fp32_fp32_CHW_Redmule( + const float32_t *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out, + uint32_t C_out, const float32_t *__restrict__ pWeight, uint32_t C_in, + float32_t *__restrict__ pGradIn, uint32_t H_in, uint32_t W_in, + float32_t *__restrict__ pTransposeBuffer, + uint32_t transposeBufferSize) { + + (void)transposeBufferSize; + + const int8_t core_id = pi_core_id(); + const uint32_t SP = (H_out > 0) ? (H_in / H_out) : 1; + const uint32_t SQ = (W_out > 0) ? (W_in / W_out) : 1; + const uint32_t P = H_out * W_out; + const bool strided = (SP != 1) || (SQ != 1); + + // ---- 1. Zero pGradIn (parallel) --------------------------------------- + const uint32_t dx_total = C_in * H_in * W_in; + const uint32_t dx_chunk = (dx_total + NUM_CORES - 1) / NUM_CORES; + const uint32_t dx_lo = MIN((uint32_t)core_id * dx_chunk, dx_total); + const uint32_t dx_hi = MIN(dx_lo + dx_chunk, dx_total); + for (uint32_t i = dx_lo; i < dx_hi; ++i) { + pGradIn[i] = 0.0f; + } + + // ---- 2. Parallel transpose W[C_out, C_in] -> W^T[C_in, C_out] -------- + const uint32_t wt_total = C_in * C_out; + const uint32_t wt_chunk = (wt_total + NUM_CORES - 1) / NUM_CORES; + const uint32_t wt_lo = MIN((uint32_t)core_id * wt_chunk, wt_total); + const uint32_t wt_hi = MIN(wt_lo + wt_chunk, wt_total); + for (uint32_t idx = wt_lo; idx < wt_hi; ++idx) { + const uint32_t c_in = idx / C_out; + const uint32_t c_out = idx % C_out; + pTransposeBuffer[idx] = pWeight[c_out * C_in + c_in]; + } + + pi_cl_team_barrier(0); + + // ---- 3. RedMulE GEMM: dX_dense[C_in, P] = W^T[C_in, C_out] @ dY[C_out, P] - + if (core_id == 0) { + if (!strided) { + MatMul_fp32_fp32_fp32_Redmule(pTransposeBuffer, pGradOut, pGradIn, + C_in, C_out, P); + } else { + float32_t *tmp_gemm = pTransposeBuffer + (C_in * C_out); + MatMul_fp32_fp32_fp32_Redmule(pTransposeBuffer, pGradOut, tmp_gemm, + C_in, C_out, P); + } + } + + pi_cl_team_barrier(0); + + // ---- 4. 
Scatter (stride > 1 only) ------------------------------------ + if (strided) { + float32_t *tmp_gemm = pTransposeBuffer + (C_in * C_out); + const uint32_t scat_total = C_in * P; + const uint32_t scat_chunk = (scat_total + NUM_CORES - 1) / NUM_CORES; + const uint32_t scat_lo = MIN((uint32_t)core_id * scat_chunk, scat_total); + const uint32_t scat_hi = MIN(scat_lo + scat_chunk, scat_total); + for (uint32_t idx = scat_lo; idx < scat_hi; ++idx) { + const uint32_t c = idx / P; + const uint32_t k = idx % P; + const uint32_t h_out = k / W_out; + const uint32_t w_out = k % W_out; + const uint32_t h_in = h_out * SP; + const uint32_t w_in = w_out * SQ; + pGradIn[c * (H_in * W_in) + h_in * W_in + w_in] = tmp_gemm[idx]; + } + pi_cl_team_barrier(0); + } +}