diff --git a/.github/workflows/_runner-siracusa-redmule-tiled.yml b/.github/workflows/_runner-siracusa-redmule-tiled.yml
new file mode 100644
index 00000000..8bf5265d
--- /dev/null
+++ b/.github/workflows/_runner-siracusa-redmule-tiled.yml
@@ -0,0 +1,161 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+---
+name: _runner-siracusa-redmule-tiled
+
+"on":
+ workflow_call:
+ inputs:
+ runner:
+ required: true
+ type: string
+ docker-image:
+ required: true
+ type: string
+ pytest-marker:
+ required: true
+ type: string
+ # Extra flags injected into the pytest command, between -v and the -m
+ # marker filter. Default keeps the original 4-worker xdist behavior;
+ # callers that want simulator stdout (e.g. GVSoC cycle counts) in the
+ # CI log can override with "-s -p no:xdist" to disable capture and
+ # the parallel worker plugin (xdist eats per-test stdout).
+ pytest-flags:
+ required: false
+ type: string
+ default: "-n 4"
+
+jobs:
+ test-runner-siracusa-redmule-tiled:
+ runs-on: ${{ inputs.runner }}
+ container:
+ image: ${{ inputs.docker-image }}
+ credentials:
+ username: ${{ github.actor }}
+ password: ${{ secrets.GITHUB_TOKEN }}
+ steps:
+ - name: Mark workspace as safe
+ run: git config --global --add safe.directory '*'
+ - name: Checkout Repo
+ uses: actions/checkout@v4
+ with:
+ submodules: recursive
+ - name: Build Deeploy
+ shell: bash
+ run: pip install -e .
+ - name: Run Test
+ run: |
+ cd DeeployTest
+ mkdir -p /app/.ccache
+ export CCACHE_DIR=/app/.ccache
+ set -o pipefail
+ pytest test_platforms.py -v ${{ inputs.pytest-flags }} -m "siracusa_redmule_tiled and ${{ inputs.pytest-marker }}" 2>&1 | tee /tmp/pytest_out.log
+ shell: bash
+ - name: Report cycle counts (RedMulE side, with speedup vs Siracusa)
+ if: always()
+ shell: bash
+ env:
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+ HEAD_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+ REPO: ${{ github.repository }}
+ MARKER: ${{ inputs.pytest-marker }}
+ run: |
+ python3 - <<'PY'
+          import json, os, re, sys, urllib.request, pathlib
+          LOG_PATH = "/tmp/pytest_out.log"
+          # Logs fetched through the REST API (baseline lookup below) carry an ISO
+          # timestamp before each line, so tolerate one such leading token.
+          PAT = re.compile(r'^(?:\S+Z )?BENCH train_cycles=(\d+) opt_cycles=(\d+) weight_sram=(\d+)')
+
+          if not pathlib.Path(LOG_PATH).exists():
+              print("no pytest log found; skipping")
+              sys.exit(0)
+
+          # 1. parse RedMulE side's BENCH lines (one per training model)
+          rmu = []
+          with open(LOG_PATH, encoding='utf-8', errors='replace') as fh:
+              for line in fh:
+                  m = PAT.search(line)
+                  if m:
+                      rmu.append({'train': int(m.group(1)), 'opt': int(m.group(2)), 'sram': int(m.group(3))})
+          if not rmu:
+              print("No BENCH line in pytest output (kernel-only job?). Skipping summary.")
+              sys.exit(0)
+
+          out = []
+          marker = os.environ.get('MARKER', '?')
+          sha = os.environ.get('HEAD_SHA', '')[:7]
+          out.append(f"## Siracusa + RedMulE cycles ({marker})")
+          out.append("")
+          out.append("| weight_sram | train_cycles | opt_cycles |")
+          out.append("|---:|---:|---:|")
+          for r in rmu:
+              out.append(f"| {r['sram']:,} | {r['train']:,} | {r['opt']:,} |")
+          out.append("")
+          out.append(f"_Counted on commit `{sha}` via GVSoC._")
+
+          # 2. best-effort: find Siracusa baseline on same SHA, build speedup table
+          repo = os.environ.get('REPO', '')
+          head_sha = os.environ.get('HEAD_SHA', '')
+          tok = os.environ.get('GH_TOKEN', '')
+
+          def gh(url):  # GET `url` with the workflow token attached; returns the raw body bytes
+              request = urllib.request.Request(url, headers={'Authorization': f'bearer {tok}'})
+              with urllib.request.urlopen(request, timeout=20) as response:
+                  return response.read()
+
+          try:
+              runs = json.loads(gh(
+                  f"https://api.github.com/repos/{repo}/actions/runs"
+                  f"?head_sha={head_sha}&per_page=30"))
+              base_run_id = next(
+                  (r['id'] for r in runs.get('workflow_runs', [])
+                   if r['name'] == 'CI • Siracusa (Tiled)' and r['event'] == 'push'),
+                  None)
+              if base_run_id is None:
+                  out += ["", "_No matching `Siracusa (Tiled)` push run on this SHA — speedup diff skipped._"]
+              else:
+                  jobs = json.loads(gh(
+                      f"https://api.github.com/repos/{repo}/actions/runs/{base_run_id}/jobs"))
+                  base_job_id = next(
+                      (j['id'] for j in jobs.get('jobs', [])
+                       if 'training' in j['name'].lower()
+                       and 'l3' in j['name'].lower()
+                       and j.get('conclusion') == 'success'),
+                      None)
+                  if base_job_id is None:
+                      out += ["", "_Siracusa training-L3 baseline job not finished/green yet — speedup diff skipped._"]
+                  else:
+                      txt = gh(f"https://api.github.com/repos/{repo}/actions/jobs/{base_job_id}/logs").decode('utf-8','replace')
+                      base = {}
+                      for line in txt.splitlines():
+                          m = PAT.search(line)
+                          if m:
+                              base[int(m.group(3))] = {'train': int(m.group(1)), 'opt': int(m.group(2))}
+                      out += ["", "## Speedup vs Siracusa baseline (matched by weight_sram)", ""]
+                      out += ["| weight_sram | Siracusa train | + RedMulE train | sptrain | Siracusa opt | + RedMulE opt | spopt |"]
+                      out += ["|---:|---:|---:|:---:|---:|---:|:---:|"]
+                      for r in rmu:
+                          b = base.get(r['sram'])
+                          if b is None:
+                              out.append(f"| {r['sram']:,} | — | {r['train']:,} | _no match_ | — | {r['opt']:,} | — |")
+                          else:
+                              # A zero cycle count would raise ZeroDivisionError and abandon the
+                              # table through the except below; render that cell as n/a instead.
+                              st = f"**{b['train'] / r['train']:.3f}×**" if r['train'] else "_n/a_"
+                              so = f"**{b['opt'] / r['opt']:.3f}×**" if r['opt'] else "_n/a_"
+                              out.append(
+                                  f"| {r['sram']:,} | {b['train']:,} | {r['train']:,} | {st} "
+                                  f"| {b['opt']:,} | {r['opt']:,} | {so} |")
+          except Exception as e:
+              out += ["", f"_Baseline lookup failed: `{type(e).__name__}: {e}` — RedMulE numbers above are still valid._"]
+
+          text = "\n".join(out) + "\n"
+          print(text)
+          sp = os.environ.get('GITHUB_STEP_SUMMARY')
+          if sp:
+              with open(sp, 'a', encoding='utf-8') as f:
+                  f.write(text)
+ PY
diff --git a/.github/workflows/_runner-siracusa-tiled.yml b/.github/workflows/_runner-siracusa-tiled.yml
index cc09f234..3e9ecaa1 100644
--- a/.github/workflows/_runner-siracusa-tiled.yml
+++ b/.github/workflows/_runner-siracusa-tiled.yml
@@ -17,6 +17,14 @@ name: _runner-siracusa-tiled
pytest-marker:
required: true
type: string
+ # Extra flags injected into the pytest command (between -v and the -m
+ # marker filter). Default empty preserves the existing sequential
+ # invocation; callers that want simulator stdout (e.g. GVSoC cycle
+ # counts) in the CI log can override with "-s" to disable capture.
+ pytest-flags:
+ required: false
+ type: string
+ default: ""
jobs:
test-runner-siracusa-tiled:
@@ -36,5 +44,28 @@ jobs:
- name: Run Test
run: |
cd DeeployTest
- pytest test_platforms.py -v -m "siracusa_tiled and ${{ inputs.pytest-marker }}"
+ set -o pipefail
+ pytest test_platforms.py -v ${{ inputs.pytest-flags }} -m "siracusa_tiled and ${{ inputs.pytest-marker }}" 2>&1 | tee /tmp/pytest_out.log
+ shell: bash
+ - name: Report cycle counts (Siracusa baseline)
+ if: always()
shell: bash
+ run: |
+ # Tabulate every BENCH line from the test output in the run summary.
+ # (The RedMulE-side report does not read this table; it re-parses the
+ # BENCH lines from this job's log for the same SHA.) Non-training jobs
+ # (kernel-only matrices) produce no BENCH lines, so the step is a no-op.
+ if ! grep -q '^BENCH train_cycles=' /tmp/pytest_out.log 2>/dev/null; then
+ echo "No BENCH line found (probably a kernel-only job); skipping summary."
+ exit 0
+ fi
+ echo "## Siracusa baseline training cycles" >> "$GITHUB_STEP_SUMMARY"
+ echo "" >> "$GITHUB_STEP_SUMMARY"
+ echo "| model (weight_sram) | train_cycles | opt_cycles |" >> "$GITHUB_STEP_SUMMARY"
+ echo "|---|---:|---:|" >> "$GITHUB_STEP_SUMMARY"
+ grep '^BENCH train_cycles=' /tmp/pytest_out.log | while read -r line; do
+ tc=$(echo "$line" | sed -nE 's/.*train_cycles=([0-9]+).*/\1/p')
+ oc=$(echo "$line" | sed -nE 's/.*opt_cycles=([0-9]+).*/\1/p')
+ ws=$(echo "$line" | sed -nE 's/.*weight_sram=([0-9]+).*/\1/p')
+ echo "| weight_sram=${ws} | ${tc} | ${oc} |" >> "$GITHUB_STEP_SUMMARY"
+ done
diff --git a/.github/workflows/ci-platform-gap9-tiled.yml b/.github/workflows/ci-platform-gap9-tiled.yml
index 6823344a..44b8d17d 100644
--- a/.github/workflows/ci-platform-gap9-tiled.yml
+++ b/.github/workflows/ci-platform-gap9-tiled.yml
@@ -21,12 +21,16 @@ concurrency:
cancel-in-progress: true
jobs:
+ # GAP9 CI requires access to the private ghcr.io/pulp-platform/deeploy-gap9
+ # image; gate on upstream org so forks skip cleanly.
select-env:
+ if: github.repository_owner == 'pulp-platform'
uses: ./.github/workflows/_select-env.yml
with:
docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/runwangdl/deeploy:gap9' }}
gap9-kernels-tiled-singlebuffer-L2:
+ if: github.repository_owner == 'pulp-platform'
needs: select-env
uses: ./.github/workflows/_runner-gap9-tiled.yml
with:
@@ -35,6 +39,7 @@ jobs:
pytest-markers: "gap9_tiled and kernels and singlebuffer and l2"
gap9-kernels-tiled-doublebuffer-L2:
+ if: github.repository_owner == 'pulp-platform'
needs: select-env
uses: ./.github/workflows/_runner-gap9-tiled.yml
with:
@@ -43,6 +48,7 @@ jobs:
pytest-markers: "gap9_tiled and kernels and doublebuffer and l2"
gap9-models-tiled-singlebuffer-L2:
+ if: github.repository_owner == 'pulp-platform'
needs: select-env
uses: ./.github/workflows/_runner-gap9-tiled.yml
with:
@@ -51,6 +57,7 @@ jobs:
pytest-markers: "gap9_tiled and models and singlebuffer and l2"
gap9-models-tiled-doublebuffer-L2:
+ if: github.repository_owner == 'pulp-platform'
needs: select-env
uses: ./.github/workflows/_runner-gap9-tiled.yml
with:
diff --git a/.github/workflows/ci-platform-gap9.yml b/.github/workflows/ci-platform-gap9.yml
index d3bf829a..e2cf26d3 100644
--- a/.github/workflows/ci-platform-gap9.yml
+++ b/.github/workflows/ci-platform-gap9.yml
@@ -22,12 +22,16 @@ concurrency:
cancel-in-progress: true
jobs:
+ # GAP9 CI requires access to the private ghcr.io/pulp-platform/deeploy-gap9
+ # image; gate on upstream org so forks skip cleanly.
select-env:
+ if: github.repository_owner == 'pulp-platform'
uses: ./.github/workflows/_select-env.yml
with:
docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/runwangdl/deeploy:gap9' }}
gap9-kernels:
+ if: github.repository_owner == 'pulp-platform'
needs: select-env
uses: ./.github/workflows/_runner-gap9.yml
with:
@@ -36,6 +40,7 @@ jobs:
pytest-marker: "kernels"
gap9-models:
+ if: github.repository_owner == 'pulp-platform'
needs: select-env
uses: ./.github/workflows/_runner-gap9.yml
with:
diff --git a/.github/workflows/ci-platform-siracusa-redmule-tiled.yml b/.github/workflows/ci-platform-siracusa-redmule-tiled.yml
new file mode 100644
index 00000000..c0f25e9c
--- /dev/null
+++ b/.github/workflows/ci-platform-siracusa-redmule-tiled.yml
@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+---
+name: CI • Siracusa + RedMulE (Tiled)
+
+"on":
+ push:
+ branches:
+ - "**"
+ tags:
+ - "v*.*.*"
+ pull_request:
+ workflow_dispatch:
+ inputs:
+ docker_image_deeploy:
+ description: "Deeploy Image to use"
+ required: false
+ default: "ghcr.io/runwangdl/deeploy:redmule"
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ select-env:
+ uses: ./.github/workflows/_select-env.yml
+ with:
+ # RedMulE CI needs the fork's custom Docker image that bundles a
+ # GVSoC build with the light_redmule model. Fall back to
+ # runwangdl/deeploy:redmule on push/PR events (when no input is
+ # provided) rather than the upstream devel image.
+ docker_image_deeploy: ${{ inputs.docker_image_deeploy || 'ghcr.io/runwangdl/deeploy:redmule' }}
+
+ siracusa-redmule-kernels-tiled-singlebuffer-L2:
+ needs: select-env
+ uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml
+ with:
+ runner: ${{ needs.select-env.outputs.runner }}
+ docker-image: ${{ needs.select-env.outputs.image }}
+ pytest-marker: "kernels and singlebuffer and l2"
+
+ siracusa-redmule-kernels-tiled-doublebuffer-L2:
+ needs: select-env
+ uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml
+ with:
+ runner: ${{ needs.select-env.outputs.runner }}
+ docker-image: ${{ needs.select-env.outputs.image }}
+ pytest-marker: "kernels and doublebuffer and l2"
+
+ siracusa-redmule-training-tiled-singlebuffer-L3:
+ needs: select-env
+ uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml
+ with:
+ runner: ${{ needs.select-env.outputs.runner }}
+ docker-image: ${{ needs.select-env.outputs.image }}
+ pytest-marker: "training and singlebuffer and l3"
+ # Disable pytest's stdout capture so GVSoC's "Cycles" report from the
+ # cct_train simulation lands in the CI log; needs -p no:xdist because
+ # the parallel worker plugin would otherwise re-buffer stdout. Only
+ # one test case in this matrix anyway, so dropping -n 4 is harmless.
+ pytest-flags: "-s -p no:xdist"
diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml
index b65cbb75..69916ee4 100644
--- a/.github/workflows/ci-platform-siracusa-tiled.yml
+++ b/.github/workflows/ci-platform-siracusa-tiled.yml
@@ -46,3 +46,7 @@ jobs:
runner: ${{ needs.select-env.outputs.runner }}
docker-image: ${{ needs.select-env.outputs.image }}
pytest-marker: "training and l3 and singlebuffer"
+ # -s makes GVSoC's per-test "Cycles" report visible in the CI log,
+ # so cct_train cycle counts on plain Siracusa can be diffed against
+ # the Siracusa+RedMulE run for an apples-to-apples speedup number.
+ pytest-flags: "-s"
diff --git a/.github/workflows/infra-generate-documentation.yml b/.github/workflows/infra-generate-documentation.yml
index 84508113..b3036d53 100644
--- a/.github/workflows/infra-generate-documentation.yml
+++ b/.github/workflows/infra-generate-documentation.yml
@@ -28,12 +28,12 @@ jobs:
sphinx-build docs _build
- name: Prepare Multipages
uses: xeratec/gh-pages-multibranch@pr/support_tags
- if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }}
+ if: ${{ (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && github.repository_owner == 'pulp-platform' }}
with:
directory: _build
- name: Deploy to GitHub Pages
uses: peaceiris/actions-gh-pages@v3
- if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch'}}
+ if: ${{ (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && github.repository_owner == 'pulp-platform' }}
with:
publish_branch: gh-pages
github_token: ${{ secrets.GITHUB_TOKEN }}
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0e07d64a..3bafd225 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC)
set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE)
endif()
-set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)")
-set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch)
+set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, Siracusa_w_redmule, PULP-Open, GAP9, Generic, Snitch)")
+set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka Siracusa_w_redmule PULP-Open GAP9 Generic Snitch)
if(platform STREQUAL MemPool)
message(STATUS "Building for platform 'MemPool'")
@@ -31,6 +31,8 @@ elseif(platform STREQUAL Siracusa)
message(STATUS "Building for platform 'Siracusa'")
elseif(platform STREQUAL Siracusa_w_neureka)
message(STATUS "Building for platform 'Siracusa_w_neureka'")
+elseif(platform STREQUAL Siracusa_w_redmule)
+ message(STATUS "Building for platform 'Siracusa_w_redmule'")
elseif(platform STREQUAL PULPOpen)
message(STATUS "Building for platform 'PULP-Open'")
elseif(platform STREQUAL GAP9)
@@ -196,7 +198,7 @@ if(platform STREQUAL QEMU-ARM)
endif()
-if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL PULPOpen)
+if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule OR platform STREQUAL PULPOpen)
if(TOOLCHAIN STREQUAL LLVM)
set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/toolchain_llvm.cmake)
@@ -206,7 +208,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor
include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp.cmake)
- if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka)
+ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/siracusa/siracusa.cmake)
elseif(platform STREQUAL PULPOpen)
include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp-open/pulp-open.cmake)
diff --git a/Deeploy/Targets/Chimera/__init__.py b/Deeploy/Targets/Chimera/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py
index 79770fe6..da553857 100644
--- a/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py
+++ b/Deeploy/Targets/Generic/TileConstraints/TransposeTileConstraint.py
@@ -24,15 +24,32 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
inputBufferName = parseDict['data_in']
outputBufferName = parseDict['data_out']
+ inputShape = ctxt.lookup(inputBufferName).shape
+ outputShape = ctxt.lookup(outputBufferName).shape
+ perm = parseDict["perm"]
+
+ # Spatial-view interpretation of the perm: it operates on the last
+ # len(perm) dims of data_in and the last len(perm) dims of data_out.
+ # MatMulLayer.computeShapes can left-pad the rank of one side without
+ # touching the other when the same gs.Variable is shared between a
+ # broadening (MatMul) and a non-broadening (Gemm/Transpose) consumer,
+ # so the constraint indexing must offset by the per-side leading-batch
+ # depth rather than assume rank == len(perm) == rank_other. When all
+ # ranks already match, offsets are 0 and behavior is unchanged.
+ inputOffset = len(inputShape) - len(perm)
+ outputOffset = len(outputShape) - len(perm)
+ assert inputOffset >= 0 and outputOffset >= 0, (f"Transpose perm {perm} is longer than tensor ranks "
+ f"data_in={inputShape}, data_out={outputShape}")
+
# Add I/O dimensions to the model as variables
for bufferName in [inputBufferName, outputBufferName]:
tilerModel.addTensorDimToModel(ctxt, bufferName)
- # Map output dims to inputs dims
- for idx, perm_idx in enumerate(parseDict["perm"]):
+ # Map output spatial dims to input spatial dims via perm.
+ for idx, perm_idx in enumerate(perm):
tilerModel.addConstraint(
- tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = idx) == tilerModel.getTensorDimVar(
- tensorName = inputBufferName, dimIdx = perm_idx))
+ tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = outputOffset + idx) ==
+ tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = inputOffset + perm_idx))
return tilerModel
@@ -50,7 +67,10 @@ def serializeTilingSolution(
replacementTypes = {}
replacements: Dict[str, List[int]] = {}
- numDims = len(ctxt.lookup(operatorRepresentation['data_in']).shape)
+ # Match the spatial-view interpretation in addGeometricalConstraint:
+ # only the last len(perm) dims of data_in are actually transposed,
+ # so emit exactly len(perm) dimLen_ replacement variables.
+ numDims = len(operatorRepresentation['perm'])
for dim in range(numDims):
replacementTypes[f"dimLen_{dim}"] = PointerClass(uint16_t)
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py
index ef046f19..ea0e880a 100644
--- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py
@@ -8,6 +8,8 @@
from Deeploy.CommonExtensions.DataTypes import float32_t
from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+float32_tPtr = PointerClass(float32_t)
+
class PULPFloatGEMMTemplate(NodeTemplate):
diff --git a/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py
index 64143a9d..bf4ca1d2 100644
--- a/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py
+++ b/Deeploy/Targets/PULPOpen/Templates/TransposeTemplate.py
@@ -65,16 +65,27 @@ def alignToContext(self, ctxt: NetworkContext,
fRep['accessStr'] = accessStr
fRep['data_out_shape'] = data_out_shape
- parallelDims = [idx for idx, dim in enumerate(data_out_shape) if dim >= 8]
+ # Spatial-view: perm targets the last len(perm) dims of data_in. When
+ # data_in has been left-padded (e.g. by MatMulLayer.computeShapes
+ # broadening a shared upstream Transpose output), offset the
+ # data_in_shape lookup so dimLen_ reflects the actual
+ # transposed dim rather than a leading batch placeholder. Same
+ # for data_out_shape -- parallelDim must index within the spatial
+ # view since the per-tile for-loop count comes from len(perm).
+ dataInOffset = len(data_in_shape) - len(perm)
+ dataOutOffset = len(data_out_shape) - len(perm)
+ spatialOutShape = list(data_out_shape[dataOutOffset:])
+
+ parallelDims = [idx for idx, dim in enumerate(spatialOutShape) if dim >= 8]
if len(parallelDims) > 0:
parallelDim = parallelDims[0]
else:
- parallelDim = data_out_shape.index(max(data_out_shape))
+ parallelDim = spatialOutShape.index(max(spatialOutShape))
forLoops = []
dimLenPtrs = []
for idx, i in enumerate(perm):
- operatorRepresentation[f"dimLen_{idx}"] = data_in_shape[idx]
+ operatorRepresentation[f"dimLen_{idx}"] = data_in_shape[dataInOffset + idx]
dimLenPtrs.append(f"dimLen_{idx}")
if idx != parallelDim:
forLoops.append(_forLoop.generate({"i": i, "dimLenPtr": f"dimLen_{i}"}))
diff --git a/Deeploy/Targets/Redmule/Bindings.py b/Deeploy/Targets/Redmule/Bindings.py
new file mode 100644
index 00000000..3017f4e8
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Bindings.py
@@ -0,0 +1,66 @@
+# ----------------------------------------------------------------------
+#
+# File: Bindings.py
+#
+# Last edited: 10.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author:
+# Luka Macan, University of Bologna
+# Moritz Scherer, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import float32_t
+from Deeploy.DeeployTypes import NodeBinding
+from Deeploy.Targets.Generic.TypeCheckers import ConvChecker, GEMMChecker, MatMulChecker
+from Deeploy.Targets.PULPOpen.Bindings import ClusterTransformer, ForkTransformer
+from Deeploy.Targets.Redmule.Templates import ConvGradTemplate, ConvTemplate, GEMMTemplate, MatmulTemplate
+
+RedmuleMatmulBindings = [
+ NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+ MatmulTemplate.referenceTemplate, ForkTransformer)
+]
+
+RedmuleConv2DBindings = [
+ NodeBinding(
+ ConvChecker([PointerClass(float32_t), PointerClass(float32_t),
+ PointerClass(float32_t)], [PointerClass(float32_t)]), ConvTemplate.reference2DIm2ColTemplate,
+ ForkTransformer)
+]
+
+RedmuleGEMMBindings = [
+ NodeBinding(
+ GEMMChecker([PointerClass(float32_t), PointerClass(float32_t),
+ PointerClass(float32_t)], [PointerClass(float32_t)]), GEMMTemplate.referenceTemplate,
+ ForkTransformer)
+]
+
+# Pointwise (1x1) ConvGradW / ConvGradX routed to RedMulE. The PULP versions
+# (PULPFloatPWConvGradW2DBindings / PULPFloatPWConvGradX2DBindings) use the
+# same ConvChecker signature, so the binding is identical apart from which
+# template -> kernel symbol is selected.
+RedmulePWConvGradW2DBindings = [
+ NodeBinding(ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+ ConvGradTemplate.referencePWConvGradW2DTemplate, ClusterTransformer)
+]
+
+RedmulePWConvGradX2DBindings = [
+ NodeBinding(ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+ ConvGradTemplate.referencePWConvGradX2DTemplate, ClusterTransformer)
+]
diff --git a/Deeploy/Targets/Redmule/Deployer.py b/Deeploy/Targets/Redmule/Deployer.py
new file mode 100644
index 00000000..89ba2b92
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Deployer.py
@@ -0,0 +1,59 @@
+# ----------------------------------------------------------------------
+#
+# File: Deployer.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Dict, Type
+
+import onnx_graphsurgeon as gs
+
+from Deeploy.AbstractDataTypes import Pointer
+from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer
+from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer
+from Deeploy.Targets.Redmule.TopologyOptimizationPasses.Passes import RedMuleGEMMTransposePass
+
+
+class RedmuleDeployer(PULPDeployer):
+
+ def __init__(self,
+ graph: gs.Graph,
+ deploymentPlatform: DeploymentPlatform,
+ inputTypes: Dict[str, Type[Pointer]],
+ loweringOptimizer: TopologyOptimizer,
+ scheduler: Callable = lambda graph: list(graph.nodes),
+ name: str = 'DeeployNetwork',
+ default_channels_first = False,
+ deeployStateDir: str = "DeeployStateDir",
+ inputOffsets = {}):
+ super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name,
+ default_channels_first, deeployStateDir, inputOffsets)
+
+ self.loweringOptimizer.passes += [
+ # RedMuleAdjustWeightMemoryLayoutPass is currently not registered:
+ # it transposes Conv weights from [F,P,Q,C] to [P,Q,C,F] for the
+ # RedMulE accelerator, but Conv is back on PULPClusterEngine (see
+ # Engine.RedmuleMapping for why) and PULP expects [F,P,Q,C].
+ # Restore alongside the Conv mapping when RedmuleConv2DTileConstraint
+ # learns spatial tiling.
+ RedMuleGEMMTransposePass("Redmule")
+ ]
diff --git a/Deeploy/Targets/Redmule/Engine.py b/Deeploy/Targets/Redmule/Engine.py
new file mode 100644
index 00000000..9b929ab4
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Engine.py
@@ -0,0 +1,99 @@
+# ----------------------------------------------------------------------
+#
+# File: Engine.py
+#
+# Last edited: 26.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Moritz Scherer, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List
+
+from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper
+from Deeploy.Targets.Generic.Layers import ConvGradWLayer, ConvGradXLayer, ConvLayer, GEMMLayer, MatMulLayer
+from Deeploy.Targets.Generic.Parsers import MatMulParser
+from Deeploy.Targets.PULPOpen.Parsers import PULPFPConv2DParser, PULPPWConvGradW2DParser, PULPPWConvGradX2DParser
+from Deeploy.Targets.PULPOpen.Platform import ConvGradWMapper as PULPConvGradWMapper, \
+ ConvGradXMapper as PULPConvGradXMapper, DwConvGradWMapper, DwConvGradxMapper
+from Deeploy.Targets.Redmule.Parsers import GEMMRedmuleParser
+from Deeploy.Targets.Redmule.Tiler import RedmuleConvTilingReadyBindings, RedmuleGEMMTilingReadyBindings, \
+ RedmuleMatMulTilingReadyBindings, RedmulePWConvGradW2DTilingReadyBindings, RedmulePWConvGradX2DTilingReadyBindings
+
+MatMulRedmuleMapper = NodeMapper(MatMulParser(), RedmuleMatMulTilingReadyBindings)
+Conv2DRedmuleMapper = NodeMapper(PULPFPConv2DParser(), RedmuleConvTilingReadyBindings)
+GEMMMRedmuleMapper = NodeMapper(GEMMRedmuleParser(noBiasHoisting = False), RedmuleGEMMTilingReadyBindings)
+# Pointwise (1x1) ConvGradW / ConvGradX reuse PULP's parsers verbatim --
+# they only screen for kernel_shape=[1,1] / group=1 and populate the same
+# operatorRepresentation keys our Redmule templates consume.
+PWConvGradW2DRedmuleMapper = NodeMapper(PULPPWConvGradW2DParser(), RedmulePWConvGradW2DTilingReadyBindings)
+PWConvGradX2DRedmuleMapper = NodeMapper(PULPPWConvGradX2DParser(), RedmulePWConvGradX2DTilingReadyBindings)
+
+RedmuleMapping = {
+ 'MatMul': MatMulLayer([MatMulRedmuleMapper]),
+ # 'Conv' is currently routed to PULPClusterEngine (see comment below).
+ # The RedMulE-accelerated kernel and its template are kept in-tree
+ # (TargetLibraries/PULPOpen/src/Conv2d_Im2Col_fp32_Redmule.c +
+ # Deeploy/Targets/Redmule/Templates/ConvTemplate.py) so the path is
+ # ready to re-enable once RedmuleConv2DTileConstraint learns spatial
+ # tiling with halo regions. Today its addPolicyConstraint hard-pins
+ # inputHeightVar / inputWidthVar to the full feature-map size, which
+ # forces the entire activation tensor into L1 -- workable for tiny
+ # tokenizer-style Convs (CCT2 has 8x8 inputs and L1=128 KiB fits),
+ # but ResNet8 / MobileNet middle layers exceed L1 immediately
+ # (32x32x16 input + 32x32x16 output alone is 128 KiB). PULP's
+ # Conv2DTileConstraint already supports spatial halos, so falling
+ # back keeps the bigger Conv-heavy training fixtures tilable while
+ # MatMul / Gemm continue to bind to RedMulE.
+ #
+ # When that tile-constraint upgrade lands, restore:
+ # 'Conv': ConvLayer([Conv2DRedmuleMapper]),
+ # and the matching RedMuleAdjustWeightMemoryLayoutPass in Deployer.py.
+ 'Gemm': GEMMLayer([GEMMMRedmuleMapper]),
+ # NOTE: ConvGradW / ConvGradX are intentionally NOT mapped here.
+ # _selectEngine() is first-match across engines, so putting them on the
+ # RedmuleEngine would route every 3x3 / depthwise ConvGrad through this
+ # engine's layer and never let PULPClusterEngine see them. We tried a
+ # "complete" RedmuleEngine layer ([PW_Redmule, DW_PULP, regular_PULP])
+ # but the resulting tiler hit infeasible memory-pattern constraints on
+ # ResNet8 / MobileNet despite using identical mapper instances to
+ # PULP -- some interaction between the layer object identity and the
+ # tiling-pattern solver we couldn't fully diagnose.
+ #
+ # Instead, the RedMulE PWConvGrad mappers are inserted into the
+ # existing PULPClusterEngine ConvGradW / ConvGradX layers at position 0
+ # in RedmulePlatform.__init__. That keeps the layer object identical
+ # to the pure-PULP path (matters for the tiler) while still ensuring
+ # 1x1 ConvGrads bind to the RedMulE kernels.
+}
+
+_includeList = []
+
+_redmuleInitCode = r"""
+// Redmule engine initialization
+"""
+
+
+class RedmuleEngine(DeploymentEngine):
+
+ def __init__(self,
+ name: str,
+ Mapping = RedmuleMapping,
+ initCode: str = _redmuleInitCode,
+ includeList: List[str] = _includeList) -> None:
+ super().__init__(name, Mapping, initCode, includeList)
diff --git a/Deeploy/Targets/Redmule/Parsers.py b/Deeploy/Targets/Redmule/Parsers.py
new file mode 100644
index 00000000..d359bbbd
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Parsers.py
@@ -0,0 +1,114 @@
+# ----------------------------------------------------------------------
+#
+# File: Parsers.py
+#
+# Last edited: 15.12.2021
+#
+# Copyright (C) 2021, ETH Zurich and University of Bologna.
+#
+# Authors:
+# - Moritz Scherer, ETH Zurich
+# - Victor Jung, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Tuple
+
+import numpy as np
+import onnx_graphsurgeon as gs
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import float32_t
+from Deeploy.DeeployTypes import NetworkContext
+from Deeploy.Targets.Generic.Parsers import MatMulParser
+
+
+class GEMMRedmuleParser(MatMulParser):
+    """Gemm parser for the RedMulE target, built on top of MatMulParser.
+
+    Adds the Gemm attributes (transA, transB, alpha, beta) to the operator
+    representation and, unless noBiasHoisting is set, hoists a zero-filled
+    C operand for Gemm nodes that have no third input.
+    """
+
+    def __init__(self, noBiasHoisting = True):
+        # Order matters: super().__init__() of MatMulParser also writes
+        # self.noBiasHoisting from its own default, so call super first and
+        # then overwrite, otherwise our flag gets clobbered to True.
+        super().__init__(noBiasHoisting = noBiasHoisting)
+        self.noBiasHoisting = noBiasHoisting
+
+    def parseNode(self, node: gs.Node) -> (bool):
+        """Accept nodes with >= 2 inputs, exactly one output and alpha == 1.
+
+        ``alpha`` is optional: when the attribute is absent it is taken to
+        be 1, the same default that is stored below. Indexing
+        ``node.attrs['alpha']`` directly would raise KeyError for such nodes.
+        """
+        attrs = node.attrs
+        ret = all([len(node.inputs) >= 2, len(node.outputs) == 1, attrs.get('alpha', 1) == 1])
+
+        if ret:
+            # Absent attributes fall back to transA = transB = 0, alpha = beta = 1
+            for key, default in (('transA', 0), ('transB', 0), ('alpha', 1), ('beta', 1)):
+                self.operatorRepresentation[key] = attrs.get(key, default)
+
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+        """Bind the A, B, C and data_out buffer names for an accepted node.
+
+        Returns the context and success flag of MatMulParser.parseNodeCtxt;
+        the operator representation is only extended when that flag is true.
+        """
+        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)
+
+        if ret:
+            inputs = ['A', 'B']
+            outputs = ['data_out']
+
+            # zip() stops after 'A' and 'B'; a third input is handled below
+            for key, inputNode in zip(inputs, node.inputs):
+                self.operatorRepresentation[key] = newCtxt.lookup(inputNode.name).name
+            for idx, outputNode in enumerate(node.outputs):
+                self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name
+
+            if len(node.inputs) == 3:
+                self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name
+            elif not self.noBiasHoisting:
+                # Hoist a zero C tensor whose shape matches the GEMM output, so
+                # the bias-required RedmuleGEMMTileConstraint and the existing
+                # 3-operand kernel template can run unchanged on bias-less
+                # Gemm nodes (e.g. backward GradFusedMatMul rewrites in CCT
+                # training graphs that emit Y = A @ B with no C).
+                biasName = f'{node.name}_C_Tensor'
+                values = np.zeros(node.outputs[0].shape, dtype = np.float32)
+                zeroTensor = gs.Constant(biasName, values = values)
+                newCtxt.hoistConstant(zeroTensor, _type = PointerClass(float32_t))
+                # Also wire the hoisted Constant into the gs.Node inputs so the
+                # tiler picks it up via its `node.inputs + node.outputs` walk, and
+                # register the Gemm as a user of the new buffer so the kill-set
+                # analysis (which walks `_users`) can find a consumer for it.
+                node.inputs.append(zeroTensor)
+                newCtxt.addUser(biasName, node)
+                self.operatorRepresentation['C'] = biasName
+
+            self.operatorRepresentation['size'] = np.prod(newCtxt.lookup(node.inputs[0].name).shape)
+
+        return newCtxt, ret
diff --git a/Deeploy/Targets/Redmule/Platform.py b/Deeploy/Targets/Redmule/Platform.py
new file mode 100644
index 00000000..8906b6d2
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Platform.py
@@ -0,0 +1,71 @@
+# ----------------------------------------------------------------------
+#
+# File: Platform.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from Deeploy.DeeployTypes import TopologyOptimizer
+from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPConstantBuffer, PULPOptimizer, PULPPlatform, \
+ PULPStructBuffer, PULPTransientBuffer, PULPVariableBuffer
+from Deeploy.Targets.Redmule.Engine import PWConvGradW2DRedmuleMapper, PWConvGradX2DRedmuleMapper, RedmuleEngine
+
+RedmuleOptimizer = TopologyOptimizer([*PULPOptimizer.passes])
+
+
+class RedmulePlatform(PULPPlatform):
+ """PULPPlatform whose default engines are a RedmuleEngine followed by a PULPClusterEngine."""
+ def __init__(self,
+ engines = [RedmuleEngine("Redmule"), PULPClusterEngine("PULPCluster")],
+ variableBuffer = PULPVariableBuffer,
+ constantBuffer = PULPConstantBuffer,
+ structBuffer = PULPStructBuffer,
+ transientBuffer = PULPTransientBuffer) -> None:
+ super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer)
+
+ # Insert the RedMulE PWConvGrad mappers at position 0 of the
+ # PULPClusterEngine's ConvGradW / ConvGradX layer mapper lists.
+ # See the comment on RedmuleMapping in Engine.py: we cannot route
+ # those op types through RedmuleEngine itself without confusing the
+ # tiler, so we mutate the (still-pure-PULP) PULPClusterEngine layer
+ # in place. Order matters: PW Redmule must come before PULP's PW
+ # mapper, so 1x1 ConvGrads bind to the RedMulE kernel; non-PW
+ # variants fall through to PULP's DW / regular mappers as before.
+ pulp_cluster = next((e for e in self.engines if e.name == "PULPCluster"), None)
+ if pulp_cluster is not None:
+ # Both PWConvGradW and PWConvGradX RedMulE mappers are hooked up
+ # to PULPCluster's existing layer mapper lists. ConvGradW was
+ # disabled temporarily in 68d1639 because its template sized the
+ # transpose buffer at C_in * H_in * W_in, which over-counted the
+ # actual footprint for stride > 1 1x1 convs (ResNet8 layer2/3
+ # downsample) and tripped tiler infeasibility on the regular-Conv
+ # backward pattern memory. After dropping that to the exact
+ # C_in * H_out * W_out and teaching the kernel to sample X at
+ # strided positions, the W path is back in.
+ for op_type, redmule_mapper in (
+ ("ConvGradW", PWConvGradW2DRedmuleMapper),
+ ("ConvGradX", PWConvGradX2DRedmuleMapper),
+ ):
+ layer_factory = pulp_cluster.Mapping.get(op_type)
+ if layer_factory is not None and hasattr(layer_factory, "maps"):
+ # Avoid double-inserting across repeated platform inits.
+ if redmule_mapper not in layer_factory.maps:
+ layer_factory.maps.insert(0, redmule_mapper)
diff --git a/Deeploy/Targets/Redmule/Templates/ConvGradTemplate.py b/Deeploy/Targets/Redmule/Templates/ConvGradTemplate.py
new file mode 100644
index 00000000..b2246cdb
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/ConvGradTemplate.py
@@ -0,0 +1,148 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+"""Templates that route Pointwise (1x1) ConvGradW / ConvGradX to RedMulE.
+
+Both kernels reuse the existing PULPOpen tile constraints
+(PWConvGradWTileConstraint / PWConvGradXTileConstraint) so the tile-shape
+search is identical to the pulp-trainlib variants; only the kernel body
+calls into PWConvGrad_fp32_Redmule.c instead, which materialises the
+necessary transpose into a transient buffer and then fires a single
+RedMulE GEMM.
+"""
+
+from typing import Dict, List, Tuple, Union
+
+from ortools.constraint_solver.pywrapcp import IntVar
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class RedmulePWConvGradWTemplate(NodeTemplate):
+ """RedMulE pointwise ConvGradW: dW = dY @ X^T (1x1 kernel).
+
+ Reserves a fixed-size chunk scratch buffer in place of a full
+ C_in * H_in * W_in transpose: PWGW_CHUNK_P * (C_in + C_out) elements,
+ sized by computeTransientBuffersSize and independent of the
+ feature-map area. The buffer name is exposed to the template string as
+ 'transposeBuffer'; the kernel-side use of the buffer lives in
+ TargetLibraries/PULPOpen/src/PWConvGrad_fp32_Redmule.c.
+ """
+
+ def __init__(self, templateStr: str):
+ super().__init__(templateStr)
+
+ # Must stay in sync with PWGW_CHUNK_P in PWConvGrad_fp32_Redmule.c.
+ PWGW_CHUNK_P = 16
+
+ @staticmethod
+ def computeTransientBuffersSize(
+ ctxt: NetworkContext,
+ operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
+ # Fixed-size chunk scratch: PWGW_CHUNK_P rows of [C_in] for the
+ # X-sampled-and-transposed slice + PWGW_CHUNK_P rows of [C_out] for
+ # the dY view (used by the multi-chunk path when P > CHUNK_P).
+ # Independent of the layer's feature-map area -- crucial on
+ # MobileNetV1 early blocks where H_out * W_out can hit 48*48 and a
+ # full transpose buffer would blow L1.
+ wbytes = operatorRepresentation["data_in_type"].typeWidth // 8
+ chunk = RedmulePWConvGradWTemplate.PWGW_CHUNK_P
+ bt_dim = wbytes * chunk * (operatorRepresentation['ch_im_in'] +
+ operatorRepresentation['ch_im_out'])
+ bt_name = operatorRepresentation['nodeName'] + "_transpose_buffer"
+ return [(bt_name, bt_dim)]
+
+ def hoistTransientBuffers(
+ self, ctxt: NetworkContext,
+ operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+ bt_name, bt_dim = RedmulePWConvGradWTemplate.computeTransientBuffersSize(
+ ctxt, operatorRepresentation)[0]
+ ctxt.hoistTransientBuffer(bt_name, bt_dim)
+ operatorRepresentation['transposeBuffer'] = bt_name
+ operatorRepresentation['transposeBufferSize'] = bt_dim
+ return ctxt, operatorRepresentation, [bt_name]
+
+
+class RedmulePWConvGradXTemplate(NodeTemplate):
+    """RedMulE pointwise ConvGradX: dX = scatter(W^T @ dY) (1x1 kernel).
+
+    One transient buffer backs two regions:
+
+    * C_in * C_out elements for the transposed weight matrix, and
+    * C_in * H_out * W_out elements for the dense GEMM result.
+
+    The kernel (not visible here) is expected to need the dense region
+    only for stride > 1, where the GEMM output has to be scattered into
+    the [C_in, H_in, W_in] dX tensor; at stride 1 the GEMM writes dX
+    directly. Both regions are always reserved so that the worst-case
+    size stays simple.
+    """
+
+    def __init__(self, templateStr: str):
+        super().__init__(templateStr)
+
+    @staticmethod
+    def computeTransientBuffersSize(
+            ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
+        # Size = (weight_type.typeWidth // 8) * (weight elements + dense elements)
+        rep = operatorRepresentation
+        widthBytes = rep["weight_type"].typeWidth // 8
+        wtElts = rep['ch_im_in'] * rep['ch_im_out']
+        denseElts = rep['ch_im_in'] * rep['dim_im_out_x'] * rep['dim_im_out_y']
+        return [(rep['nodeName'] + "_transpose_buffer", widthBytes * (wtElts + denseElts))]
+
+    def hoistTransientBuffers(
+            self, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        # Reserve the buffer and publish its name and size for the template
+        (bufName, bufSize), = RedmulePWConvGradXTemplate.computeTransientBuffersSize(
+            ctxt, operatorRepresentation)
+        ctxt.hoistTransientBuffer(bufName, bufSize)
+        operatorRepresentation['transposeBuffer'] = bufName
+        operatorRepresentation['transposeBufferSize'] = bufSize
+        return ctxt, operatorRepresentation, [bufName]
+
+
+referencePWConvGradW2DTemplate = RedmulePWConvGradWTemplate("""
+// 2D FP Pointwise ConvGradW (1x1) CHW via RedMulE (Name: ${nodeName}, Op: ${nodeOp})
+${grad_out_type.typeName} ref_${grad_weight}_${grad_out} = ${grad_out};
+${data_in_type.typeName} ref_${grad_weight}_${data_in} = ${data_in};
+${grad_weight_type.typeName} ref_${grad_weight}_out = ${grad_weight};
+
+for (uint32_t n = 0; n < ${batch}; ++n) {
+ PWConvGradW2d_fp${grad_out_type.referencedType.typeWidth}_fp${data_in_type.referencedType.typeWidth}_fp${grad_weight_type.referencedType.typeWidth}_CHW_Redmule(
+ ref_${grad_weight}_${grad_out},
+ ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out},
+ ref_${grad_weight}_${data_in},
+ ${dim_im_in_x}, ${dim_im_in_y}, ${ch_im_in},
+ ref_${grad_weight}_out,
+ ${transposeBuffer}
+ );
+
+ ref_${grad_weight}_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x};
+ ref_${grad_weight}_${data_in} += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x};
+}
+""")
+
+referencePWConvGradX2DTemplate = RedmulePWConvGradXTemplate("""
+// 2D FP Pointwise ConvGradX (1x1) CHW via RedMulE (Name: ${nodeName}, Op: ${nodeOp})
+${grad_out_type.typeName} ref_${grad_in}_${grad_out} = ${grad_out};
+${weight_type.typeName} ref_${grad_in}_${weight} = ${weight};
+${grad_in_type.typeName} ref_${grad_in}_out = ${grad_in};
+
+for (uint32_t n = 0; n < ${batch}; ++n) {
+ PWConvGradX2d_fp${grad_out_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${grad_in_type.referencedType.typeWidth}_CHW_Redmule(
+ ref_${grad_in}_${grad_out},
+ ${dim_im_out_x}, ${dim_im_out_y}, ${ch_im_out},
+ ref_${grad_in}_${weight},
+ ${ch_im_in},
+ ref_${grad_in}_out,
+ ${dim_im_in_x}, ${dim_im_in_y},
+ ${transposeBuffer}, ${transposeBufferSize}
+ );
+
+ ref_${grad_in}_${grad_out} += ${ch_im_out} * ${dim_im_out_y} * ${dim_im_out_x};
+ ref_${grad_in}_out += ${ch_im_in} * ${dim_im_in_y} * ${dim_im_in_x};
+}
+""")
diff --git a/Deeploy/Targets/Redmule/Templates/ConvTemplate.py b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py
new file mode 100644
index 00000000..3ce9d61e
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py
@@ -0,0 +1,98 @@
+# ----------------------------------------------------------------------
+#
+# File: ConvTemplate.py
+#
+# Last edited: 09.05.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple, Union
+
+from ortools.constraint_solver.pywrapcp import IntVar
+
+from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
+
+
+class RedmuleFloatConvIm2ColTemplate(NodeTemplate):
+    """Conv template that reserves a streaming im2col scratch buffer."""
+
+    def __init__(self, templateStr):
+        super().__init__(templateStr)
+
+    @staticmethod
+    def computeTransientBuffersSize(
+            ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
+        # The buffer holds `chunkRows` im2col rows of K = C_in * P * Q values
+        # at 4 bytes each. chunkRows must stay in sync with the
+        # IM2COL_CHUNK_ROWS macro in Conv2d_Im2Col_fp32_Redmule.c. Capping the
+        # buffer at 16 rows instead of a full-image im2col keeps it small
+        # enough for L1 (a full image would need e.g. 576 KiB for
+        # H_out * W_out = 1024 and K = 144).
+        chunkRows = 16
+        rep = operatorRepresentation
+        rowElts = rep['ch_im_in'] * rep['dim_kernel_x'] * rep['dim_kernel_y']
+        return [(rep['nodeName'] + "_buffer", 4 * chunkRows * rowElts)]
+
+    def hoistTransientBuffers(self, ctxt: NetworkContext,
+                              operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]:
+        # Reserve the scratch buffer and publish its name and byte size in
+        # the operator representation
+        (bufName, bufSize), = RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize(
+            ctxt, operatorRepresentation)
+        ctxt.hoistTransientBuffer(bufName, bufSize)
+
+        operatorRepresentation['ctxtBuffer'] = bufName
+        operatorRepresentation['ctxtBufferSize'] = bufSize
+        return ctxt, operatorRepresentation, [bufName]
+
+
+reference2DIm2ColTemplate = RedmuleFloatConvIm2ColTemplate("""
+// 2D FP Conv HWC Parallel with Im2Col (Name: ${nodeName}, Op: ${nodeOp})
+${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
+${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+
+for (uint32_t n=0; n<${batch}; ++n) {
+
+ Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC_8_Redmule(
+ ref_${data_out}_${data_in},
+ ${dim_im_in_y},
+ ${dim_im_in_x},
+ ${ch_im_in},
+ ${weight},
+ ${dim_kernel_y},
+ ${dim_kernel_x},
+ ${stride_y},
+ ${stride_x},
+ ${bias},
+ ${has_bias},
+ ref_${data_out}_${data_out},
+ ${ch_im_out},
+ ${padding_y_top},
+ ${padding_y_bottom},
+ ${padding_x_left},
+ ${padding_x_right},
+ ${ctxtBuffer}
+ );
+
+ ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y};
+ ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y};
+}
+""")
diff --git a/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py
new file mode 100644
index 00000000..ba41ab76
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py
@@ -0,0 +1,61 @@
+# ----------------------------------------------------------------------
+#
+# File: GEMMTemplate.py
+#
+# Last edited: 27.01.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// GEMM using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+
+if (${nodeName}_core_id == 0) {
+ for(uint32_t b=0; b<${batch}; b++) {
+ ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
+ ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
+ ${C_type.typeName} batch_C = ${C} + b * ${M} * ${O};
+ ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};
+
+ % if beta == 0:
+ MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
+ (const float32_t *) batch_A,
+ (const float32_t *) batch_B,
+ (float32_t *) batch_out,
+ ${M},
+ ${N},
+ ${O}
+ );
+ % else:
+ Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule(
+ (const float32_t *) batch_A,
+ (const float32_t *) batch_B,
+ (const float32_t *) batch_C,
+ (float32_t *) batch_out,
+ ${M},
+ ${N},
+ ${O}
+ );
+ % endif
+ }
+}
+""")
diff --git a/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py
new file mode 100644
index 00000000..cb077ca8
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py
@@ -0,0 +1,49 @@
+# ----------------------------------------------------------------------
+#
+# File: MatmulTemplate.py
+#
+# Last edited: 27.01.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// Matmul using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp})
+
+int8_t ${nodeName}_core_id = pi_core_id();
+int8_t ${nodeName}_num_cores = NUM_CORES;
+
+if (${nodeName}_core_id == 0) {
+ for(uint32_t b=0; b<${batch}; b++) {
+ ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N};
+ ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O};
+ ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O};
+
+ MatMul_fp32_fp32_fp32_Redmule(
+ (const float32_t *) batch_A,
+ (const float32_t *) batch_B,
+ (float32_t *) batch_out,
+ ${M},
+ ${N},
+ ${O}
+ );
+ }
+}
+""")
\ No newline at end of file
diff --git a/Deeploy/Targets/Redmule/Templates/__init__.py b/Deeploy/Targets/Redmule/Templates/__init__.py
new file mode 100644
index 00000000..a73187ca
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Templates/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------
+#
+# File: __init__.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import *
diff --git a/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py
new file mode 100644
index 00000000..1b3a93f6
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py
@@ -0,0 +1,279 @@
+# ----------------------------------------------------------------------
+#
+# File: ConvTileConstraint.py
+#
+# Last edited: 09.05.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author:
+# - Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple, Union
+
+from ortools.constraint_solver.pywrapcp import IntVar
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+ VariableReplacementScheme
+
+
+class RedmuleConv2DTileConstraint(TileConstraint):
+
+ @staticmethod
+ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+     """Relate the output tile dimensions to the input and weight tiles.
+
+     Input and output are addressed as (batch, axis 1, axis 2, channel) and
+     the weight as (H, W, Cin, Cout). The output-size constraints contain
+     no dilation term. NOTE(review): presumably dilation 1 is assumed --
+     confirm against the kernel.
+     """
+
+     # Names of the three tensors that take part in tiling
+     inName = parseDict['data_in']
+     wName = parseDict['weight']
+     outName = parseDict['data_out']
+
+     strides = parseDict["strides"]
+     padding = parseDict["pads"]
+     dilation = parseDict["dilations"]  # looked up but not used below
+
+     # Register every dimension of the three tensors as a model variable
+     for tensorName in (inName, wName, outName):
+         tilerModel.addTensorDimToModel(ctxt, tensorName)
+
+     def dimVars(tensorName):
+         # Variables of axes 0..3, fetched in ascending axis order
+         return [tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = axis) for axis in range(4)]
+
+     inputBatchVar, inputHeightVar, inputWidthVar, inputChannelVar = dimVars(inName)
+     weightHeightVar, weightWidthVar, weightInChannelVar, weightOutChannelVar = dimVars(wName)
+     outputBatchVar, outputHeightVar, outputWidthVar, outputChannelVar = dimVars(outName)
+
+     # Batch follows the input, output channels follow the weight's Cout
+     tilerModel.addConstraint(outputBatchVar == inputBatchVar)
+     tilerModel.addConstraint(outputChannelVar == weightOutChannelVar)
+
+     # Padding is only counted on an axis when the tile covers the full
+     # input extent of that axis
+     fullShape = ctxt.lookup(inName).shape
+     effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == fullShape[1]))
+     effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == fullShape[2]))
+
+     tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (weightHeightVar - 1) - 1) // strides[0] + 1))
+     tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (weightWidthVar - 1) - 1) // strides[1] + 1))
+
+     return tilerModel
+
+ @staticmethod
+ def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+     """Pin every tiled dimension except the weight's Cout axis to its full size."""
+
+     inputBuffer = ctxt.lookup(name = parseDict['data_in'])
+     weightBuffer = ctxt.lookup(name = parseDict['weight'])
+
+     # Input axes 1 and 2 are the two image extents, axis 3 is the channel
+     inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1)
+     inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2)
+     inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3)
+
+     # Weight axes follow the (H, W, Cin, Cout) layout
+     weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0)
+     weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1)
+     weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 2)
+     weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3)
+
+     # RW: Conv only tiled on outchannel, so the whole input tensor ...
+     tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in'])
+     tilerModel.addConstraint(inputHeightVar == parseDict['dim_im_in_x'])
+     tilerModel.addConstraint(inputWidthVar == parseDict['dim_im_in_y'])
+
+     # ... and the kernel window / input-channel axes of the weight stay whole
+     tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x'])
+     tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y'])
+     tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in'])
+
+     # NOTE(review): 12 is carried over unchanged; presumably it reflects the
+     # RedMulE array geometry -- confirm against the kernel.
+     outChannel = parseDict["ch_im_out"]
+     if outChannel >= 12:
+         tilerModel.addTileSizeDivisibleConstraint(parseDict,
+                                                   "ch_im_out",
+                                                   weightOutChannelVar,
+                                                   12,
+                                                   strategy = PerformanceHint(priority = 1))
+     else:
+         tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max(), strategy = PerformanceHint(1))
+
+     return tilerModel
+
+ @staticmethod
+ def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict,
+                              ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]:
+     """Return a copy of parseDict with three entries replaced by model variables."""
+     inName = ctxt.lookup(name = parseDict['data_in']).name
+     wName = ctxt.lookup(name = parseDict['weight']).name
+     symbolicParseDict = parseDict.copy()
+     symbolicParseDict.update({
+         'dim_im_in_x': tilerModel.getTensorDimVar(inName, 1),
+         # Kernel extents sit on weight axes 0 and 1
+         'dim_kernel_x': tilerModel.getTensorDimVar(wName, 0),
+         'dim_kernel_y': tilerModel.getTensorDimVar(wName, 1),
+     })
+     return symbolicParseDict
+
+ @staticmethod
+ def computeMargins(kernelShape: Tuple[int, ...]) -> Tuple[int, ...]:
+     """Return (left, right, top, bottom) halo margins for a kernel shape.
+
+     kernelShape[1] drives left/right and kernelShape[0] top/bottom. An
+     even extent yields zero margins, an odd extent yields extent // 2 on
+     both sides.
+     """
+
+     def halfExtent(extent: int) -> int:
+         # Even kernels get no halo; odd kernels get floor(extent / 2)
+         return 0 if extent % 2 == 0 else extent // 2
+
+     sideMargin = halfExtent(kernelShape[1])
+     vertMargin = halfExtent(kernelShape[0])
+
+     return sideMargin, sideMargin, vertMargin, vertMargin
+
+ @staticmethod
+ def computeInputCube(kernelShape: Tuple[int, ...], pads: Tuple[int, ...], strides: Tuple[int, ...],
+                      weightChannels: int, outputCube: HyperRectangle,
+                      outputDims: Tuple[int, ...]) -> Tuple[HyperRectangle, Tuple[int, ...]]:
+     """Map one output tile to its input tile and (left, right, top, bottom) padding."""
+     (BatchOffset, HOffset, WOffset, COffset) = outputCube.offset
+     (BatchSize, HSize, WSize, CSize) = outputCube.dims
+     leftMargin, rightMargin, topMargin, bottomMargin = RedmuleConv2DTileConstraint.computeMargins(kernelShape)
+
+     # A padding value is kept only on a side where the tile touches the
+     # border of the output tensor
+     atTop, atLeft = HOffset == 0, WOffset == 0
+     atBottom = HOffset + HSize == outputDims[1]
+     atRight = WOffset + WSize == outputDims[2]
+     padding_top, padding_bottom = atTop * pads[0], atBottom * pads[2]
+     padding_left, padding_right = atLeft * pads[1], atRight * pads[3]
+
+     # Tiles that do not start at offset 0 begin one margin earlier
+     inputHOffset = HOffset * strides[0] - topMargin * (HOffset != 0)
+     inputWOffset = WOffset * strides[1] - leftMargin * (WOffset != 0)
+     inputHSize = HSize * strides[0] + (topMargin + bottomMargin) - (padding_top + padding_bottom)
+     inputWSize = WSize * strides[1] + (leftMargin + rightMargin) - (padding_left + padding_right)
+
+     InCube = HyperRectangle((BatchOffset, inputHOffset, inputWOffset, 0),
+                             (BatchSize, inputHSize, inputWSize, weightChannels))
+     return InCube, (padding_left, padding_right, padding_top, padding_bottom)
+
+ @classmethod
+ def serializeTilingSolution(
+         cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+         targetMemLevel: str, ctxt: NetworkContext,
+         operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+     """Turn the solved output tiles into a tiling schedule and per-tile replacements.
+
+     For each output tile the matching input tile is derived with
+     computeInputCube and the weight is sliced along its Cout axis. The
+     input/output extents, the output channel count and the four padding
+     values of every tile are collected as variable replacements.
+
+     Returns:
+         (VariableReplacementScheme, TilingSchedule) built from those
+         per-tile lists and from the base offsets reported by
+         cls.extractBaseAddr for data_in, weight and data_out.
+     """
+     outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+     addrNames = ['data_in', 'weight', 'data_out']
+     inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                               operatorRepresentation, addrNames)
+
+     varWeight = operatorRepresentation['weight']
+     varOut = operatorRepresentation['data_out']
+
+     # One list per replaced template variable; tile i contributes entry i
+     replacements: Dict[str, List[int]] = {
+         "dim_im_in_x": [],
+         "dim_im_in_y": [],
+         "dim_im_out_x": [],
+         "dim_im_out_y": [],
+         "ch_im_out": [],
+         "padding_y_top": [],
+         "padding_y_bottom": [],
+         "padding_x_left": [],
+         "padding_x_right": []
+     }
+
+     # Extents and channel counts are emitted as uint16_t, paddings as uint8_t
+     replacementTypes = {
+         "dim_im_in_x": PointerClass(uint16_t),
+         "dim_im_in_y": PointerClass(uint16_t),
+         "dim_im_out_x": PointerClass(uint16_t),
+         "dim_im_out_y": PointerClass(uint16_t),
+         "ch_im_out": PointerClass(uint16_t),
+         "padding_y_top": PointerClass(uint8_t),
+         "padding_y_bottom": PointerClass(uint8_t),
+         "padding_x_left": PointerClass(uint8_t),
+         "padding_x_right": PointerClass(uint8_t)
+     }
+
+     # Weight layout is (H, W, Cin, Cout)
+     weightShape = ctxt.lookup(varWeight).shape
+     weightH, weightW, weightC = weightShape[0], weightShape[1], weightShape[2]
+
+     pads = operatorRepresentation['pads']
+     strides = operatorRepresentation['strides']
+
+     inputLoadSchedule = []
+     for cube in outputCubes:
+         (BatchOffset, HOffset, WOffset, COffset) = cube.offset
+         (BatchSize, HSize, WSize, CSize) = cube.dims
+
+         InCube, padding_tuple = RedmuleConv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides,
+                                                                              weightC, cube,
+                                                                              ctxt.lookup(varOut).shape)
+         padding_left, padding_right, padding_top, padding_bottom = padding_tuple
+
+         replacements['dim_im_in_x'].append(InCube.dims[1])
+         replacements['dim_im_in_y'].append(InCube.dims[2])
+         replacements['dim_im_out_x'].append(HSize)
+         replacements['dim_im_out_y'].append(WSize)
+         replacements['ch_im_out'].append(CSize)
+
+         replacements['padding_y_top'].append(padding_top)
+         replacements['padding_y_bottom'].append(padding_bottom)
+         replacements['padding_x_left'].append(padding_left)
+         replacements['padding_x_right'].append(padding_right)
+
+         # Only the Cout axis (index 3) of the weight is sliced per tile
+         WeightCube = HyperRectangle((0, 0, 0, COffset), (weightH, weightW, weightC, CSize))
+         inputLoadSchedule.append({"data_in": InCube, "weight": WeightCube})
+
+     outputLoadSchedule = [{"data_out": out} for out in outputCubes]
+
+     tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+     variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)
+
+     return variableReplacementSchedule, tilingSchedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py
new file mode 100644
index 00000000..fbae4824
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py
@@ -0,0 +1,198 @@
+# ----------------------------------------------------------------------
+#
+# File: GEMMTileConstraint.py
+#
+# Last edited: 02.06.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author:
+# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich
+# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+ VariableReplacementScheme
+
+
+class RedmuleGEMMTileConstraint(TileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        """Register A, B, C and the output with the tiler and couple their matrix dimensions.
+
+        Only the two innermost dimensions of each tensor are constrained here; ``transA`` and
+        ``transB`` select which of A's and B's two innermost dimensions plays which role.
+        """
+        operandA = ctxt.lookup(name = parseDict['A'])
+        operandB = ctxt.lookup(name = parseDict['B'])
+        addend = ctxt.lookup(name = parseDict['C'])
+        result = ctxt.lookup(name = parseDict['data_out'])
+
+        # Every dimension of every operand becomes a tiler variable
+        for tensorName in (operandA.name, operandB.name, addend.name, result.name):
+            tilerModel.addTensorDimToModel(ctxt, tensorName)
+
+        def matrixDimVars(tensor, swapped = 0):
+            # Variables of the two innermost dims, exchanged when the transpose flag is 1
+            rowIdx = len(tensor.shape) - 2
+            rowVar = tilerModel.getTensorDimVar(tensorName = tensor.name, dimIdx = rowIdx + swapped)
+            colVar = tilerModel.getTensorDimVar(tensorName = tensor.name, dimIdx = rowIdx + 1 - swapped)
+            return rowVar, colVar
+
+        rowsA, colsA = matrixDimVars(operandA, parseDict['transA'])
+        rowsB, colsB = matrixDimVars(operandB, parseDict['transB'])
+        rowsOut, colsOut = matrixDimVars(result)
+
+        # The output tile takes its rows from A and its columns from B
+        tilerModel.addConstraint(rowsOut == rowsA)
+        tilerModel.addConstraint(colsOut == colsB)
+
+        # The reduction dimension must agree between A and B
+        tilerModel.addConstraint(colsA == rowsB)
+
+        # The addend C is tiled exactly like the output
+        rowsC, colsC = matrixDimVars(addend)
+        tilerModel.addConstraint(rowsOut == rowsC)
+        tilerModel.addConstraint(colsOut == colsC)
+
+        return tilerModel
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        """Fix the reduction dimension to parseDict['N'] and add PerformanceHint(1) constraints for M and O."""
+        from Deeploy.TilingExtension.TilerModel import PerformanceHint
+
+        bufferA = ctxt.lookup(name = parseDict['A'])
+        bufferB = ctxt.lookup(name = parseDict['B'])
+
+        # The matrix sits in the two innermost dimensions of each operand
+        dimOffsetA = len(bufferA.shape) - 2
+        dimOffsetB = len(bufferB.shape) - 2
+
+        AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA'])
+        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                   dimIdx = dimOffsetA + 1 - parseDict['transA'])
+        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = dimOffsetB + parseDict['transB'])
+        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                   dimIdx = dimOffsetB + 1 - parseDict['transB'])
+
+        # VIC: We don't want to deal with intermediate results between kernel calls
+        tilerModel.addConstraint(ASecondDimVar == parseDict['N'])
+        tilerModel.addConstraint(BFirstDimVar == parseDict['N'])
+
+        tilerModel.addConstraint(AFirstDimVar == AFirstDimVar.Max(), strategy = PerformanceHint(1))
+        tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1))
+
+        return tilerModel
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        """Build, for every output tile, the A/B/C input tiles and the M/N/O/batch replacement values."""
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['A', 'B', 'C', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        transA = operatorRepresentation['transA']
+        transB = operatorRepresentation['transB']
+
+        varA = operatorRepresentation['A']
+        varB = operatorRepresentation['B']
+        # N is A's last dimension, or its second-to-last one when transA is set
+        if transA == 0:
+            NSize = ctxt.lookup(varA).shape[-1]
+        else:
+            NSize = ctxt.lookup(varA).shape[-2]
+
+        NOffset = 0
+
+        inputACubes = []
+        inputBCubes = []
+        inputAddCubes = []
+
+        replacements = {"M": [], "O": [], "batch": []}
+
+        # Every output tile is produced from one A tile, one B tile and one C tile. Reconstruct them.
+        for cube in outputCubes:
+            # Defaults for the leading dimensions that a lower-rank output tile does not have
+            BSize = 1
+            BOffset = 0
+            BatchSize = 1
+            BatchOffset = 0
+            # Unpack the tile by rank: 2 -> (M, O), 3 -> (Batch, M, O), otherwise four entries are expected
+            if len(cube.offset) == 2:
+                (MOffset, OOffset) = cube.offset
+                (MSize, OSize) = cube.dims
+            elif len(cube.offset) == 3:
+                (BatchOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, MSize, OSize) = cube.dims
+            else:
+                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, BSize, MSize, OSize) = cube.dims
+            # NOTE(review): "batch" records BSize, which stays 1 unless the tile has four entries -- confirm
+            replacements["M"].append(MSize)
+            replacements["O"].append(OSize)
+            replacements["batch"].append(BSize)
+            # A and B tiles are always built with four entries; a set transpose flag swaps the last two
+            if transA == 0:
+                ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize))
+            else:
+                ACube = HyperRectangle((BatchOffset, BOffset, NOffset, MOffset), (BatchSize, BSize, NSize, MSize))
+
+            if transB == 0:
+                BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize))
+            else:
+                BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize))
+            # The C tile has the same offset and dims as the output tile
+            CCube = HyperRectangle(cube.offset, cube.dims)
+
+            inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+            inputAddCubes.append(CCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(uint8_t)
+        }
+
+        for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
+            inputLoadSchedule.append({"A": a, "B": b, "C": c})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
new file mode 100644
index 00000000..1b14ccc4
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
@@ -0,0 +1,197 @@
+# ----------------------------------------------------------------------
+#
+# File: MatmulTileConstraint.py
+#
+# Last edited: 28.04.2025
+#
+# Copyright (C) 2025, ETH Zurich and University of Bologna.
+#
+# Author: [Your Name]
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Tuple
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import int8_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+ VariableReplacementScheme
+
+
+class RedmuleMatmulTileConstraint(TileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        """Register A, B and the output with the tiler and couple their batch and matrix dimensions.
+
+        All dimension indices are derived from the rank of A; B and the output are indexed with
+        the same positions.
+        """
+        operandA = ctxt.lookup(name = parseDict['A'])
+        operandB = ctxt.lookup(name = parseDict['B'])
+        result = ctxt.lookup(name = parseDict['data_out'])
+
+        # Every dimension of every operand becomes a tiler variable
+        for tensor in (operandA, operandB, result):
+            tilerModel.addTensorDimToModel(ctxt, tensor.name)
+
+        rank = len(operandA.shape)
+        rowIdx, colIdx = rank - 2, rank - 1
+
+        def dimVar(tensor, idx):
+            return tilerModel.getTensorDimVar(tensorName = tensor.name, dimIdx = idx)
+
+        rowsA = dimVar(operandA, rowIdx + parseDict['transA'])
+        colsA = dimVar(operandA, colIdx - parseDict['transA'])
+        rowsB = dimVar(operandB, rowIdx + parseDict['transB'])
+        colsB = dimVar(operandB, colIdx - parseDict['transB'])
+        rowsOut = dimVar(result, rowIdx)
+        colsOut = dimVar(result, colIdx)
+
+        # Leading dimensions are tiled identically on the output, A and B
+        for leadIdx in range(rank - 2):
+            tilerModel.addConstraint(dimVar(result, leadIdx) == dimVar(operandA, leadIdx))
+            tilerModel.addConstraint(dimVar(result, leadIdx) == dimVar(operandB, leadIdx))
+
+        # The output tile takes its rows from A and its columns from B
+        tilerModel.addConstraint(rowsOut == rowsA)
+        tilerModel.addConstraint(colsOut == colsB)
+
+        # The reduction dimension must agree between A and B
+        tilerModel.addConstraint(colsA == rowsB)
+
+        return tilerModel
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        """Fix the reduction dimension to parseDict['N'] and add tile-shape hints for the M and O dimensions."""
+        bufferA = ctxt.lookup(name = parseDict['A'])
+        bufferB = ctxt.lookup(name = parseDict['B'])
+
+        tensorsShapeLen = len(bufferA.shape)
+
+        AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transA'])
+        ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name,
+                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transA'])
+        BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                  dimIdx = (tensorsShapeLen - 2) + parseDict['transB'])
+        BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name,
+                                                   dimIdx = (tensorsShapeLen - 1) - parseDict['transB'])
+
+        # VIC: We don't want to deal with intermediate results between kernel calls
+        tilerModel.addConstraint(ASecondDimVar == parseDict['N'])
+        tilerModel.addConstraint(BFirstDimVar == parseDict['N'])
+
+        # Tile-shape hints: prefer full-width O tiles, M in multiples of 16 and O in multiples of 12
+        tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1))
+
+        M_full_size = ctxt.lookup(bufferA.name).shape[(tensorsShapeLen - 2) + parseDict['transA']]
+        if M_full_size >= 16:
+            tilerModel.addTileSizeDivisibleConstraint(parseDict,
+                                                      "M",
+                                                      AFirstDimVar,
+                                                      16,
+                                                      strategy = PerformanceHint(priority = 1))
+        else:
+            tilerModel.addConstraint(AFirstDimVar == AFirstDimVar.Max(), strategy = PerformanceHint(1))
+        # O is B's second matrix axis: read the threshold from the same axis BSecondDimVar was taken from
+        O_full_size = ctxt.lookup(bufferB.name).shape[(tensorsShapeLen - 1) - parseDict['transB']]
+        if O_full_size >= 12:
+            tilerModel.addTileSizeDivisibleConstraint(parseDict,
+                                                      "O",
+                                                      BSecondDimVar,
+                                                      12,
+                                                      strategy = PerformanceHint(priority = 1))
+        else:
+            tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1))
+
+        return tilerModel
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        """Build, for every output tile, the A/B input tiles and the M/N/O/batch replacement values."""
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['A', 'B', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        varA = operatorRepresentation['A']
+        # NOTE(review): N is read from A's last dim and transA/transB are not consulted below -- confirm both are 0
+        NSize = ctxt.lookup(varA).shape[-1]
+        NOffset = 0
+
+        inputACubes = []
+        inputBCubes = []
+
+        replacements = {"M": [], "O": [], "batch": []}
+
+        # Every output tile is produced from one A tile and one B tile. Reconstruct this pair.
+        for cube in outputCubes:
+            # Defaults for the leading dimensions that a lower-rank output tile does not have
+            BSize = 1
+            BOffset = 0
+            BatchSize = 1
+            BatchOffset = 0
+            # Unpack the tile by rank: 2 -> (M, O), 3 -> (Batch, M, O), otherwise four entries are expected
+            if len(cube.offset) == 2:
+                (MOffset, OOffset) = cube.offset
+                (MSize, OSize) = cube.dims
+            elif len(cube.offset) == 3:
+                (BatchOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, MSize, OSize) = cube.dims
+            else:
+                (BatchOffset, BOffset, MOffset, OOffset) = cube.offset
+                (BatchSize, BSize, MSize, OSize) = cube.dims
+            # NOTE(review): "batch" records BSize, which stays 1 unless the tile has four entries -- confirm
+            replacements["M"].append(MSize)
+            replacements["O"].append(OSize)
+            replacements["batch"].append(BSize)
+            # A and B tiles are always built with four entries, whatever the rank of the output tile
+            ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize))
+            BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize))
+
+            inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+        # NOTE(review): every size is typed as PointerClass(int8_t) -- confirm tile sizes always fit that type
+        replacementTypes = {
+            "M": PointerClass(int8_t),
+            "N": PointerClass(int8_t),
+            "O": PointerClass(int8_t),
+            "batch": PointerClass(int8_t)
+        }
+
+        for a, b in zip(inputACubes, inputBCubes):
+            inputLoadSchedule.append({"A": a, "B": b})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/__init__.py b/Deeploy/Targets/Redmule/TileConstraints/__init__.py
new file mode 100644
index 00000000..a73187ca
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------
+#
+# File: __init__.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import *
diff --git a/Deeploy/Targets/Redmule/Tiler.py b/Deeploy/Targets/Redmule/Tiler.py
new file mode 100644
index 00000000..5264c089
--- /dev/null
+++ b/Deeploy/Targets/Redmule/Tiler.py
@@ -0,0 +1,49 @@
+# ----------------------------------------------------------------------
+#
+# File: Tiler.py
+#
+# Last edited: 26.07.2024
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Moritz Scherer, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from Deeploy.Targets.PULPOpen.TileConstraints.ConvGradConstraint import PWConvGradWTileConstraint, \
+ PWConvGradXTileConstraint
+from Deeploy.Targets.Redmule.Bindings import RedmuleConv2DBindings, RedmuleGEMMBindings, RedmuleMatmulBindings, \
+ RedmulePWConvGradW2DBindings, RedmulePWConvGradX2DBindings
+from Deeploy.Targets.Redmule.TileConstraints.ConvTileConstraint import RedmuleConv2DTileConstraint
+from Deeploy.Targets.Redmule.TileConstraints.GEMMTileConstraint import RedmuleGEMMTileConstraint
+from Deeploy.Targets.Redmule.TileConstraints.MatmulTileConstraint import RedmuleMatmulTileConstraint
+from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings
+
+RedmuleMatMulTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = RedmuleMatmulBindings, tileConstraint = RedmuleMatmulTileConstraint())
+RedmuleConvTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = RedmuleConv2DBindings, tileConstraint = RedmuleConv2DTileConstraint())
+RedmuleGEMMTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = RedmuleGEMMBindings, tileConstraint = RedmuleGEMMTileConstraint())
+
+# The pointwise-conv gradient ops keep PULP's PWConvGradW / PWConvGradX tile
+# constraints as they are: the tile-shape search follows from the op semantics
+# (1x1 conv backward), not from the engine that executes the kernel. Only the
+# bindings (template + kernel) are RedMulE-specific.
+RedmulePWConvGradW2DTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = RedmulePWConvGradW2DBindings, tileConstraint = PWConvGradWTileConstraint())
+RedmulePWConvGradX2DTilingReadyBindings = TilingReadyNodeBindings(
+    nodeBindings = RedmulePWConvGradX2DBindings, tileConstraint = PWConvGradXTileConstraint())
diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py
new file mode 100644
index 00000000..f9d3d95b
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py
@@ -0,0 +1,150 @@
+# ----------------------------------------------------------------------
+#
+# File: Passes.py
+#
+# Last edited: 09.05.2025
+#
+# Copyright (C) 2025, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import onnx_graphsurgeon as gs
+
+from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match
+from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic
+from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
+ _appendTranspose
+
+
+def _redmule_weight_layout_fun(graph: gs.Graph, match: Match, name: str):
+    """Permute a constant Conv weight from [cout, h, w, cin] to [h, w, cin, cout] in place for RedMule."""
+    convNode = list(match.nodes_map.values())[0]
+    weights = convNode.inputs[1]
+    # Only a gs.Constant is rewritten; any other weight tensor is left as it is
+    if not isinstance(weights, gs.Constant):
+        return graph
+    weights.values = np.transpose(weights.values, (1, 2, 3, 0))
+    return graph
+
+
+@contextagnostic
+class RedMuleAdjustWeightMemoryLayoutPass(ReplaceSequentialPatternPass):
+    """Pass to convert Conv weights from [cout, h, w, cin] to [h, w, cin, cout] for the RedMule accelerator"""
+
+    def __init__(self, redmuleEngineName: str):
+        # Pattern: a single Conv node; redmuleEngineName is accepted but not used here
+        pattern = gs.Graph()
+        convIn = gs.Variable(name = 'input_1')
+        convOut = pattern.layer(inputs = [convIn], outputs = ['convOut'], op = 'Conv', name = 'conv')
+        pattern.inputs.append(convIn)
+        pattern.outputs.append(convOut)
+        super().__init__(pattern, _redmule_weight_layout_fun, "_REDMULE_ADJUST_WEIGHT_MEMORY_LAYOUT_PASS")
+
+
+def _redmule_gemm_transpose_fun(graph: gs.Graph, match: Match, name: str):
+    """
+    Resolve the transA/transB attributes of the matched Gemm node and return the graph.
+
+    Missing transA/transB/alpha/beta attributes are set to 0/0/1.0/1.0. For a set transpose flag, a
+    constant operand is transposed in place, any other operand goes through `_appendTranspose`; the flag is then 0.
+    """
+    matched_nodes = [m for k, m in match.nodes_map.items()]
+    gemm_node = matched_nodes[0]
+
+    if 'transA' not in gemm_node.attrs:
+        gemm_node.attrs['transA'] = 0
+    if 'transB' not in gemm_node.attrs:
+        gemm_node.attrs['transB'] = 0
+    if 'alpha' not in gemm_node.attrs:
+        gemm_node.attrs['alpha'] = 1.0
+    if 'beta' not in gemm_node.attrs:
+        gemm_node.attrs['beta'] = 1.0
+
+    inputA = gemm_node.inputs[0]
+    inputB = gemm_node.inputs[1]
+    # transA: transpose operand A and clear the flag
+    if gemm_node.attrs['transA'] != 0:
+        if isinstance(inputA, gs.Constant):
+            print(f"Physical transpose for constant A: {inputA.name}")
+            # More than two dims: swap only the last two axes; otherwise a plain np.transpose
+            if len(inputA.values.shape) > 2:
+                perm = list(range(len(inputA.values.shape)))
+                perm[-1], perm[-2] = perm[-2], perm[-1]
+                inputA.values = np.transpose(inputA.values, perm)
+            else:
+                inputA.values = np.transpose(inputA.values)
+
+            gemm_node.attrs['transA'] = 0
+        else:
+            # Non-constant A: permutation that swaps the last two axes, applied via _appendTranspose
+            perm = list(range(len(inputA.shape)))
+            perm[-1], perm[-2] = perm[-2], perm[-1]
+
+            anchorTransposeNode = _appendTranspose(inputA, gemm_node, perm)
+            gemm_node.attrs['transA'] = 0
+            graph.nodes.append(anchorTransposeNode)
+    # transB: same treatment for operand B (note: the print statements differ from the A branch)
+    if gemm_node.attrs['transB'] != 0:
+        if isinstance(inputB, gs.Constant):
+            # More than two dims: swap only the last two axes; otherwise a plain np.transpose
+            if len(inputB.values.shape) > 2:
+
+                perm = list(range(len(inputB.values.shape)))
+                perm[-1], perm[-2] = perm[-2], perm[-1]
+
+                inputB.values = np.transpose(inputB.values, perm)
+            else:
+                inputB.values = np.transpose(inputB.values)
+
+            gemm_node.attrs['transB'] = 0
+        else:
+            print(f"Adding transpose node for variable B: {inputB.name}")
+            # Non-constant B: permutation that swaps the last two axes, applied via _appendTranspose
+            perm = list(range(len(inputB.shape)))
+            perm[-1], perm[-2] = perm[-2], perm[-1]
+
+            anchorTransposeNode = _appendTranspose(inputB, gemm_node, perm)
+            gemm_node.attrs['transB'] = 0
+            graph.nodes.append(anchorTransposeNode)
+
+    return graph
+
+
+@contextagnostic
+class RedMuleGEMMTransposePass(ReplaceSequentialPatternPass):
+    """Pass that applies `_redmule_gemm_transpose_fun` to Gemm nodes to resolve their transA/transB attributes"""
+
+    def __init__(self, redmuleEngineName: str):
+        # redmuleEngineName is accepted but not used when building the pattern
+        gemmPattern = gs.Graph()
+        operandA = gs.Variable(name = "input_a")
+        operandB = gs.Variable(name = "input_b")
+
+        # Pattern graph: one Gemm node fed by the two variables above
+        gemmOutput = gemmPattern.layer(
+            op = "Gemm",
+            name = "gemm_node",
+            inputs = [operandA, operandB],
+            outputs = ["gemm_output"],
+        )
+        gemmPattern.inputs = [operandA, operandB]
+        gemmPattern.outputs = [gemmOutput]
+        super().__init__(pattern = gemmPattern,
+                         replacement_fn = _redmule_gemm_transpose_fun,
+                         name = "_REDMULE_GEMM_TRANSPOSE_PASS")
diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py
new file mode 100644
index 00000000..63063b60
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py
@@ -0,0 +1,26 @@
+# ----------------------------------------------------------------------
+#
+# File: __init__.py
+#
+# Last edited: 09.05.2025
+#
+# Copyright (C) 2024, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from . import *
diff --git a/Deeploy/Targets/Redmule/__init__.py b/Deeploy/Targets/Redmule/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/Deeploy/Targets/SoftHier/__init__.py b/Deeploy/Targets/SoftHier/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt
index 3d6480d5..9dd0bb65 100644
--- a/DeeployTest/CMakeLists.txt
+++ b/DeeployTest/CMakeLists.txt
@@ -57,7 +57,7 @@ elseif(DEEPLOY_ARCH STREQUAL PULP)
target_compile_options(network PRIVATE -Wno-pointer-sign)
endif()
- if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka)
+ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule)
add_subdirectory(Platforms/Siracusa)
elseif(platform STREQUAL PULPOpen)
add_subdirectory(Platforms/PULPOpen)
diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py
index f29891bf..4b05bd59 100644
--- a/DeeployTest/conftest.py
+++ b/DeeployTest/conftest.py
@@ -66,6 +66,8 @@ def pytest_configure(config: pytest.Config) -> None:
config.addinivalue_line("markers", "siracusa_tiled: mark test as a Siracusa platform test (tiled)")
config.addinivalue_line("markers",
"siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)")
+ config.addinivalue_line("markers",
+ "siracusa_redmule_tiled: mark test as a Siracusa + RedMulE platform test (tiled)")
config.addinivalue_line("markers", "gap9: mark test as a GAP9 platform test")
config.addinivalue_line("markers", "gap9_tiled: mark test as a GAP9 platform test (tiled)")
config.addinivalue_line("markers", "kernels: mark test as a kernel test (individual operators)")
diff --git a/DeeployTest/testRunner_tiled_siracusa_w_redmule.py b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py
new file mode 100644
index 00000000..9ebd9c63
--- /dev/null
+++ b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py
@@ -0,0 +1,49 @@
+# ----------------------------------------------------------------------
+#
+# File: testRunner_tiled_siracusa_w_redmule.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from testUtils.testRunner import TestRunner, TestRunnerArgumentParser
+
+if __name__ == "__main__":
+    # Command-line interface: the shared tiling options plus the number of cluster cores
+    parser = TestRunnerArgumentParser(
+        tiling_arguments = True,
+        description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & Redmule).")
+    parser.add_argument(
+        '--cores',
+        metavar = '',
+        dest = 'cores',
+        type = int,
+        default = 1,
+        help = 'Set number of cluster cores',
+    )
+    numCores = parser.parse_args().cores
+
+    # Siracusa_w_redmule platform on the gvsoc simulator, with tiling enabled
+    runner = TestRunner(platform = "Siracusa_w_redmule",
+                        simulator = "gvsoc",
+                        tiling = True,
+                        argument_parser = parser)
+    runner.cmake_args += f" -D NUM_CORES={numCores}"
+    runner.run()
diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py
index 9d526906..32c06c95 100644
--- a/DeeployTest/testUtils/platformMapping.py
+++ b/DeeployTest/testUtils/platformMapping.py
@@ -25,13 +25,17 @@
NeurekaPlatform
from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer
from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform
+from Deeploy.Targets.Redmule.Deployer import RedmuleDeployer
+from Deeploy.Targets.Redmule.Platform import RedmuleOptimizer, RedmulePlatform
from Deeploy.Targets.Snitch.Deployer import SnitchDeployer
from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform
from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer
from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform
_SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"]
-_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"]
+_NONSIGNPROP_PLATFORMS = [
+ "Siracusa", "Siracusa_w_neureka", "Siracusa_w_redmule", "PULPOpen", "Snitch", "Chimera", "GAP9"
+]
_PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS
@@ -67,6 +71,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]:
elif platformName == "Siracusa_w_neureka":
Platform = NeurekaPlatform()
+ elif platformName == "Siracusa_w_redmule":
+ Platform = RedmulePlatform()
+
elif platformName == "Snitch":
Platform = SnitchPlatform()
@@ -84,7 +91,7 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]:
def setupMemoryPlatform(platform: DeploymentPlatform, memoryHierarchy: MemoryHierarchy,
defaultTargetMemoryLevel: MemoryLevel) -> Union[MemoryPlatform, MemoryPlatformWrapper]:
- if isinstance(platform, PULPPlatform):
+ if isinstance(platform, (PULPPlatform, RedmulePlatform)):
return MemoryPULPPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel)
elif isinstance(platform, NeurekaPlatform):
weightMemoryLevel = memoryHierarchy.memoryLevels["WeightMemory_SRAM"] \
@@ -207,6 +214,24 @@ def mapDeployer(platform: DeploymentPlatform,
default_channels_first = default_channels_first,
deeployStateDir = deeployStateDir)
+ elif isinstance(platform, RedmulePlatform):
+
+ if loweringOptimizer is None:
+ loweringOptimizer = RedmuleOptimizer
+
+ if default_channels_first is None:
+ default_channels_first = False
+
+ deployer = RedmuleDeployer(graph,
+ platform,
+ inputTypes,
+ loweringOptimizer,
+ scheduler,
+ name = name,
+ default_channels_first = default_channels_first,
+ deeployStateDir = deeployStateDir,
+ inputOffsets = inputOffsets)
+
elif isinstance(platform, (GAP9Platform, MemoryGAP9Platform, MemoryGAP9PlatformWrapper)):
if loweringOptimizer is None:
diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index 7eee2085..83cdb131 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -35,6 +35,12 @@
from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS as NEUREKA_L3_DOUBLEBUFFER_MODELS
from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS_WMEM as NEUREKA_L3_DOUBLEBUFFER_MODELS_WMEM
from test_siracusa_neureka_tiled_config import L3_SINGLEBUFFER_MODELS as NEUREKA_L3_SINGLEBUFFER_MODELS
+from test_siracusa_redmule_tiled_config import DEFAULT_CORES as REDMULE_DEFAULT_CORES
+from test_siracusa_redmule_tiled_config import L2_DOUBLEBUFFER_KERNELS as REDMULE_L2_DOUBLEBUFFER_KERNELS
+from test_siracusa_redmule_tiled_config import L2_SINGLEBUFFER_KERNELS as REDMULE_L2_SINGLEBUFFER_KERNELS
+from test_siracusa_redmule_tiled_config import \
+ L3_SINGLEBUFFER_TRAINING_MODELS as REDMULE_L3_SINGLEBUFFER_TRAINING_MODELS
+from test_siracusa_redmule_tiled_config import TRAINING_MODEL_OVERRIDES as REDMULE_TRAINING_MODEL_OVERRIDES
from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS, L2_SINGLEBUFFER_KERNELS, \
L2_SINGLEBUFFER_MODELS
from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS
@@ -1100,3 +1106,95 @@ def test_gap9_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolch
double_buffer = True,
)
run_and_assert_test(test_name, config, skipgen, skipsim)
+
+
+@pytest.mark.siracusa_redmule_tiled
+@pytest.mark.kernels
+@pytest.mark.singlebuffer
+@pytest.mark.l2
+@pytest.mark.parametrize(
+    "test_params",
+    generate_test_params(REDMULE_L2_SINGLEBUFFER_KERNELS, "L2-singlebuffer"),
+    ids = param_id,
+)
+def test_siracusa_redmule_tiled_kernels_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir,
+                                                        cmake_args, skipgen, skipsim) -> None:
+    kernel, l1_size, _config_name = test_params  # third entry is not used when building the config
+    config = create_test_config(
+        test_name = kernel,
+        platform = "Siracusa_w_redmule",
+        simulator = "gvsoc",
+        cores = REDMULE_DEFAULT_CORES,
+        deeploy_test_dir = deeploy_test_dir,
+        toolchain = toolchain,
+        toolchain_dir = toolchain_dir,
+        cmake_args = cmake_args,
+        tiling = True,
+        l1 = l1_size,
+        default_mem_level = "L2",
+        double_buffer = False,  # single-buffer variant
+    )
+    run_and_assert_test(kernel, config, skipgen, skipsim)
+
+
+@pytest.mark.siracusa_redmule_tiled
+@pytest.mark.kernels
+@pytest.mark.doublebuffer
+@pytest.mark.l2
+@pytest.mark.parametrize(
+    "test_params",
+    generate_test_params(REDMULE_L2_DOUBLEBUFFER_KERNELS, "L2-doublebuffer"),
+    ids = param_id,
+)
+def test_siracusa_redmule_tiled_kernels_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir,
+                                                        cmake_args, skipgen, skipsim) -> None:
+    kernel, l1_size, _config_name = test_params  # third entry is not used when building the config
+    config = create_test_config(
+        test_name = kernel,
+        platform = "Siracusa_w_redmule",
+        simulator = "gvsoc",
+        cores = REDMULE_DEFAULT_CORES,
+        deeploy_test_dir = deeploy_test_dir,
+        toolchain = toolchain,
+        toolchain_dir = toolchain_dir,
+        cmake_args = cmake_args,
+        tiling = True,
+        l1 = l1_size,
+        default_mem_level = "L2",
+        double_buffer = True,  # double-buffer variant
+    )
+    run_and_assert_test(kernel, config, skipgen, skipsim)
+
+
+@pytest.mark.siracusa_redmule_tiled
+@pytest.mark.training
+@pytest.mark.singlebuffer
+@pytest.mark.l3
+@pytest.mark.parametrize(
+    "test_params",
+    generate_test_params(REDMULE_L3_SINGLEBUFFER_TRAINING_MODELS, "L3-singlebuffer-training"),
+    ids = param_id,
+)
+def test_siracusa_redmule_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir,
+                                                         cmake_args, skipgen, skipsim) -> None:
+    model, l1_size, _config_name = test_params  # third entry is not used when building the config
+    per_model = REDMULE_TRAINING_MODEL_OVERRIDES.get(model, {})  # empty dict -> both knobs are None
+    config = create_test_config(
+        test_name = model,
+        platform = "Siracusa_w_redmule",
+        simulator = "gvsoc",
+        cores = REDMULE_DEFAULT_CORES,
+        deeploy_test_dir = deeploy_test_dir,
+        toolchain = toolchain,
+        toolchain_dir = toolchain_dir,
+        cmake_args = cmake_args,
+        tiling = True,
+        l1 = l1_size,
+        l2 = 2000000,
+        default_mem_level = "L3",
+        double_buffer = False,
+        training = True,
+        training_num_data_inputs = per_model.get("num_data_inputs"),
+        training_tolerance = per_model.get("tolerance"),
+    )
+    run_and_assert_test(model, config, skipgen, skipsim)
diff --git a/DeeployTest/test_siracusa_redmule_tiled_config.py b/DeeployTest/test_siracusa_redmule_tiled_config.py
new file mode 100644
index 00000000..2001513c
--- /dev/null
+++ b/DeeployTest/test_siracusa_redmule_tiled_config.py
@@ -0,0 +1,51 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+"""Test configuration for Siracusa platform with RedMulE accelerator (tiled)."""
+
+# Siracusa + RedMulE platform with tiling support
+# Default configuration: 8 cores, gvsoc simulator
+
+DEFAULT_CORES = 8
+
+# L2 single-buffer kernel tests
+# Format: dict of {test_name: [L1_sizes]}
+L2_SINGLEBUFFER_KERNELS = {
+ "Kernels/FP32/GEMM/Regular": [8000],
+ "Kernels/FP32/GEMM/TransB": [8000],
+ # Pointwise (1x1) ConvGrad fixtures from the MobileNet / ResNet8 backward
+ # paths. Both bind to RedMulE via the PWConvGrad{W,X}2DRedmuleMapper
+ # inserted into PULPCluster's ConvGrad{W,X}Layer in
+ # RedmulePlatform.__init__. L1=8000 mirrors the GEMM kernel budget.
+ "Kernels/FP32/ConvGradW_PW": [8000],
+ "Kernels/FP32/ConvGradX_PW_block_11": [8000],
+}
+
+# L2 double-buffer kernel tests
+L2_DOUBLEBUFFER_KERNELS = {
+ "Kernels/FP32/GEMM/Regular": [8000],
+}
+
+# L3 single-buffer training models. Pared down to just CCT for now: the
+# new PWConvGrad{W,X} RedMulE kernels are primarily validated via the
+# kernel-test matrix above (Kernels/FP32/ConvGradW_PW +
+# Kernels/FP32/ConvGradX_PW_block_11) which uses deterministic ORT-computed
+# references. A fully-empty dict here would make
+# `@pytest.mark.parametrize` error out at collection time with
+# "error raised while trying to determine id of parameter 'test_params' at
+# position 0", blocking the kernel jobs that share the same test module --
+# so we keep CCT as a minimum (smallest of the three). Re-add ResNet8 and
+# MobileNetV1 once the new W kernel's tiler interaction is confirmed.
+L3_SINGLEBUFFER_TRAINING_MODELS = {
+ "Models/Training/CCT/cct_train": [128000],
+}
+
+# Match the per-model overrides used in test_siracusa_tiled_config so the
+# RedMulE training run inherits the same num_data_inputs and tolerance
+# (CCT step-0 forward drift ~1.5e-3, see comment in that file).
+TRAINING_MODEL_OVERRIDES = {
+ "Models/Training/CCT/cct_train": {
+ "num_data_inputs": 1,
+ "tolerance": 5e-3,
+ },
+}
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index bafa6635..a4ad2935 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -20,6 +20,10 @@
"Kernels/FP32/Conv/Regular_2D_NoBias": [1600],
"Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [6600],
"Kernels/FP32/GEMM/Regular": [8000],
+ # PW ConvGrad baselines so the RedMulE-side speedup table has matching
+ # PULP numbers to diff against in the CI summary.
+ "Kernels/FP32/ConvGradW_PW": [8000],
+ "Kernels/FP32/ConvGradX_PW_block_11": [8000],
"Kernels/FP32/MatMul": [2000],
"Kernels/FP32/MaxPool/Regular_2D": [2000],
"Kernels/FP32/Mul": [2000],
diff --git a/Makefile b/Makefile
index f007f105..423c3b8d 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,7 @@ PULP_SDK_COMMIT_HASH ?= 7f4f22516157a1b7c55bcbbc72ca81326180b3b4
MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6
SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2
SOFTHIER_COMMIT_HASH ?= 0 # bowwang: to be updated
-GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0
+GVSOC_COMMIT_HASH ?= 35d00d15d7249daaac0de61bd8485fba128e5959
MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d
CHIMERA_SDK_COMMIT_HASH ?= b2392f6efcff75c03f4c65eaf3e12104442b22ea
XTL_VERSION ?= 0.7.5
@@ -465,7 +465,7 @@ snitch_runtime: ${SNITCH_INSTALL_DIR}
${TOOLCHAIN_DIR}/gvsoc:
cd ${TOOLCHAIN_DIR} && \
- git clone https://github.com/gvsoc/gvsoc.git && \
+ git clone https://github.com/runwangdl/gvsoc.git && \
cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \
git submodule update --init --recursive && \
pip install -r core/requirements.txt && pip install -r gapy/requirements.txt
diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt
index ce39fea7..d8db78be 100644
--- a/TargetLibraries/PULPOpen/CMakeLists.txt
+++ b/TargetLibraries/PULPOpen/CMakeLists.txt
@@ -10,7 +10,7 @@ if(NOT DEFINED ENV{PULP_SDK_HOME})
message(FATAL_ERROR "Environment variable PULP_SDK_HOME not set.")
endif()
-if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka")
+if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka" OR platform STREQUAL "Siracusa_w_redmule")
include(cmake/pulp-sdk-siracusa.cmake)
elseif(platform STREQUAL "PULPOpen")
include(cmake/pulp-sdk-pulp-open.cmake)
diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h
index 7eff2b1f..43d33593 100644
--- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h
+++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h
@@ -26,6 +26,19 @@ void PULP_Conv2d_Im2Col_fp32_fp32_fp32_HWC(
uint32_t pad_left, uint32_t pad_right,
float32_t *__restrict__ pContextBuffer);
+// RedMulE-accelerated FP32 Conv2d. Expects weight already permuted from the
+// ONNX [F, P, Q, C] layout to [P, Q, C, F] (a flat [P*Q*C, F] matrix);
+// RedMuleAdjustWeightMemoryLayoutPass handles that. pIm2ColBuf must hold
+// 16 * (C*P*Q) FP32 elements (one IM2COL_CHUNK_ROWS chunk); its size is
+// reserved by RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize.
+void Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule(
+ const float32_t *__restrict__ pIn, uint32_t H, uint32_t W, uint32_t C,
+ const float32_t *__restrict__ pWeight, uint32_t P, uint32_t Q, uint32_t SP,
+ uint32_t SQ, const float32_t *__restrict__ pBias, const bool has_bias,
+ float32_t *__restrict__ pOut, uint32_t F, uint32_t pad_top,
+ uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right,
+ float32_t *__restrict__ pIm2ColBuf);
+
void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC(
const float32_t *__restrict__ pSrcA, uint32_t H, uint32_t W, uint32_t C,
const float32_t *__restrict__ pSrcB, uint32_t F_total, uint32_t P,
@@ -93,6 +106,27 @@ void PULP_PWConvGradW2d_fp32_fp32_fp32_CHW(
uint32_t C_out, const float *__restrict__ pInput, uint32_t H_in,
uint32_t W_in, uint32_t C_in, float *__restrict__ pGradWeight);
+// RedMulE-accelerated pointwise (1x1) Conv backward weight gradient. Same
+// arg order as PULP_PWConvGradW2d_fp32_fp32_fp32_CHW plus pTransposeBuffer
+// (reserved by RedmulePWConvGradW2DTemplate.computeTransientBuffersSize):
+// 16 * C_in floats for one X^T chunk, followed by up to 16 * C_out floats
+// of dY scratch when H_out * W_out > 16. One RedMulE GEMM runs per chunk.
+void PWConvGradW2d_fp32_fp32_fp32_CHW_Redmule(
+ const float32_t *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out,
+ uint32_t C_out, const float32_t *__restrict__ pInput, uint32_t H_in,
+ uint32_t W_in, uint32_t C_in, float32_t *__restrict__ pGradWeight,
+ float32_t *__restrict__ pTransposeBuffer);
+
+// RedMulE-accelerated pointwise (1x1) Conv backward input gradient. Mirrors
+// the PULP_PWConvGradX2d_fp32_fp32_fp32_CHW signature; pTransposeBuffer holds
+// W^T (C_in * C_out floats) plus, for stride > 1, C_in * H_out * W_out more.
+void PWConvGradX2d_fp32_fp32_fp32_CHW_Redmule(
+ const float32_t *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out,
+ uint32_t C_out, const float32_t *__restrict__ pWeight, uint32_t C_in,
+ float32_t *__restrict__ pGradIn, uint32_t H_in, uint32_t W_in,
+ float32_t *__restrict__ pTransposeBuffer,
+ uint32_t transposeBufferSize);
+
void PULP_PWConvGradX2d_fp32_fp32_fp32_CHW(
const float *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out,
uint32_t C_out, const float *__restrict__ pWeight, uint32_t C_in,
diff --git a/TargetLibraries/PULPOpen/src/Conv2d_Im2Col_fp32_Redmule.c b/TargetLibraries/PULPOpen/src/Conv2d_Im2Col_fp32_Redmule.c
new file mode 100644
index 00000000..b5b91235
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/Conv2d_Im2Col_fp32_Redmule.c
@@ -0,0 +1,142 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "DeeployPULPMath.h"
+#include "pmsis.h"
+
+// RedMulE matmul kernels live in Matmul_fp32_Redmule.c and have no header
+// of their own; forward-declare the two we need rather than adding a
+// cross-file include.
+extern void MatMul_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA,
+ const float32_t *__restrict__ pSrcB,
+ float32_t *__restrict__ pDstY,
+ uint32_t M, uint32_t N, uint32_t O);
+extern void Gemm_fp32_fp32_fp32_fp32_Redmule(
+ const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB,
+ const float32_t *__restrict__ pBias, float32_t *__restrict__ pDstY,
+ uint32_t M, uint32_t N, uint32_t O);
+
+// Chunk size for the streaming im2col + RedMulE pipeline. Chosen to be 16
+// because RedMulE's FP32 mode wants M divisible by 16 for full 4x12-array
+// utilisation, and 16 rows × K columns fits comfortably in L1 for any K we
+// reasonably expect from a Conv layer (e.g. C·P·Q = 576 for a 3x3 Conv with
+// 64 input channels -> 16*576*4 = 36 KiB). The transient buffer hoisted by
+// RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize is sized to
+// exactly this many rows.
+#define IM2COL_CHUNK_ROWS 16
+
+// Layout assumptions:
+// pIn : input in HWC, shape [H, W, C]
+// pWeight : weight after RedMuleAdjustWeightMemoryLayoutPass, which
+// transposes the ONNX [F, P, Q, C] weight into [P, Q, C, F].
+// In a flat im2col-style view that is a [P*Q*C, F] matrix,
+// i.e. exactly the right operand of (im2col @ W).
+// pOut : output in HWC, shape [H_out, W_out, F]
+// pBias : optional bias of shape [F], broadcast across all output
+// positions when has_bias is true.
+// pIm2ColBuf: transient L1 scratch of size IM2COL_CHUNK_ROWS * (C*P*Q) floats,
+// hoisted by RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize.
+//
+// Compute (streaming):
+// For each chunk of IM2COL_CHUNK_ROWS output positions:
+// 1. All cluster cores cooperatively build the chunk's im2col rows
+// into pIm2ColBuf (zero-pad when h_in/w_in fall outside the input).
+// 2. Cluster barrier.
+// 3. Master core triggers one RedMulE GEMM:
+// [chunk_rows, K] @ [K, F] -> [chunk_rows, F]
+// written directly into the corresponding stripe of pOut. When
+// has_bias is set, the [F] bias is broadcast into that stripe
+// first and then Gemm is called with y_addr = z_addr = stripe
+// (same y=z aliasing pattern Matmul_fp32_Redmule already uses).
+// 4. Cluster barrier.
+//
+// Streaming was chosen over whole-image im2col because larger Conv layers
+// (e.g. ResNet8 middle layers with H_out*W_out ≥ 1024) would otherwise
+// blow the L1 budget: a 1024-row im2col with K=144 is 576 KiB, far above
+// the 128 KiB L1 tile budget. 16 rows per chunk costs a few extra RedMulE
+// triggers (~200 cycles each) but lets the tiler keep working at any
+// reasonable Conv size.
+void Conv2d_Im2Col_fp32_fp32_fp32_HWC_8_Redmule(
+ const float32_t *__restrict__ pIn, uint32_t H, uint32_t W, uint32_t C,
+ const float32_t *__restrict__ pWeight, uint32_t P, uint32_t Q, uint32_t SP,
+ uint32_t SQ, const float32_t *__restrict__ pBias, const bool has_bias,
+ float32_t *__restrict__ pOut, uint32_t F, uint32_t pad_top,
+ uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right,
+ float32_t *__restrict__ pIm2ColBuf) {
+
+ const int8_t core_id = pi_core_id();
+
+ const uint32_t H_out = (H + pad_top + pad_bottom - P) / SP + 1;
+ const uint32_t W_out = (W + pad_left + pad_right - Q) / SQ + 1;
+ const uint32_t N_out = H_out * W_out; // number of output positions
+ const uint32_t K = C * P * Q; // length of one im2col row
+
+ for (uint32_t row_start = 0; row_start < N_out;
+ row_start += IM2COL_CHUNK_ROWS) {
+ const uint32_t this_chunk =
+ ((N_out - row_start) < IM2COL_CHUNK_ROWS) ? (N_out - row_start)
+ : IM2COL_CHUNK_ROWS;
+
+ // ---- 1. Parallel im2col over this chunk's rows ----------------------
+ // Each core fills a contiguous slice of the chunk; with CHUNK_ROWS=16
+ // and NUM_CORES=8, every core handles exactly 2 rows when the chunk is
+ // full. A short tail chunk (e.g. last 5 rows) leaves the higher-numbered
+ // cores idle.
+ const uint32_t local_chunk =
+ (this_chunk + NUM_CORES - 1) / NUM_CORES;
+ const uint32_t local_start =
+ ((uint32_t)core_id * local_chunk < this_chunk)
+ ? ((uint32_t)core_id * local_chunk)
+ : this_chunk;
+ const uint32_t local_end = ((local_start + local_chunk) < this_chunk)
+ ? (local_start + local_chunk)
+ : this_chunk;
+
+ for (uint32_t r = local_start; r < local_end; ++r) {
+ const uint32_t pos = row_start + r;
+ const uint32_t h_out = pos / W_out;
+ const uint32_t w_out = pos % W_out;
+ float32_t *row = pIm2ColBuf + r * K;
+ uint32_t k = 0; // write cursor within the row, advances in (p, q, c) order
+ for (uint32_t p = 0; p < P; ++p) {
+ const int32_t h_in = (int32_t)(h_out * SP + p) - (int32_t)pad_top;
+ const bool h_in_range = (h_in >= 0) && (h_in < (int32_t)H);
+ for (uint32_t q = 0; q < Q; ++q) {
+ const int32_t w_in = (int32_t)(w_out * SQ + q) - (int32_t)pad_left;
+ if (h_in_range && (w_in >= 0) && (w_in < (int32_t)W)) {
+ const uint32_t in_base = ((uint32_t)h_in * W + (uint32_t)w_in) * C;
+ for (uint32_t c = 0; c < C; ++c) {
+ row[k++] = pIn[in_base + c];
+ }
+ } else {
+ for (uint32_t c = 0; c < C; ++c) {
+ row[k++] = 0.0f; // position falls in the padding region
+ }
+ }
+ }
+ }
+ }
+
+ pi_cl_team_barrier(0); // 2. every core has written its im2col rows
+
+ // ---- 3. RedMulE GEMM for this chunk's output stripe (core 0 only) ---
+ if (core_id == 0) {
+ float32_t *out_stripe = pOut + row_start * F;
+ if (has_bias) { // pre-fill the stripe with the bias, then accumulate
+ for (uint32_t i = 0; i < this_chunk; ++i) {
+ for (uint32_t f = 0; f < F; ++f) {
+ out_stripe[i * F + f] = pBias[f];
+ }
+ }
+ Gemm_fp32_fp32_fp32_fp32_Redmule(pIm2ColBuf, pWeight, out_stripe,
+ out_stripe, this_chunk, K, F);
+ } else {
+ MatMul_fp32_fp32_fp32_Redmule(pIm2ColBuf, pWeight, out_stripe,
+ this_chunk, K, F);
+ }
+ }
+
+ pi_cl_team_barrier(0); // 4. core 0 is back before pIm2ColBuf is refilled
+ }
+}
diff --git a/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c
new file mode 100644
index 00000000..ad33b66b
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c
@@ -0,0 +1,99 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "DeeployBasicMath.h"
+
+// Base address of the RedMulE register file accessed by this driver.
+#define REDMULE_BASE_ADDR 0x10201C00
+
+// Register offsets relative to REDMULE_BASE_ADDR.
+#define REG_MNK_M 0x00
+#define REG_MNK_N 0x04
+#define REG_MNK_K 0x08
+#define REG_X_ADDR 0x0C
+#define REG_Y_ADDR 0x10
+#define REG_Z_ADDR 0x14
+#define REG_W_ADDR 0x18
+#define REG_COMPUTE_MODE 0x1C
+#define REG_TRIGGER 0x20
+#define REG_WAIT 0x28
+
+// Value written to REG_COMPUTE_MODE to select FP32 operation.
+#define REDMULE_MODE_FP32 4
+
+// Typed pointers to the register at byte offset `off`.
+#define REDMULE_REG16(off) ((volatile uint16_t *)(REDMULE_BASE_ADDR + (off)))
+#define REDMULE_REG32(off) ((volatile uint32_t *)(REDMULE_BASE_ADDR + (off)))
+
+// Clears `count` FP32 elements starting at pDst.
+static inline void redmule_zero_fp32(float32_t *pDst, uint32_t count) {
+  for (uint32_t i = 0; i < count; i++) {
+    pDst[i] = 0.0f;
+  }
+}
+
+// Programs one FP32 job and starts it without waiting for completion.
+// The callers in this file use the job as Z = Y + X * W with X of shape
+// [M, N], W of shape [N, O] and Y/Z of shape [M, O]; Y and Z may alias.
+// Register order is kept as M, N, K, X, Y, Z, W, mode, trigger. The three
+// dimension registers are written as 16-bit values, so M, N and O are
+// truncated to their low 16 bits.
+static inline void redmule_start_fp32(const float32_t *pX, const float32_t *pW,
+                                      const float32_t *pY, float32_t *pZ,
+                                      uint32_t M, uint32_t N, uint32_t O) {
+  *REDMULE_REG16(REG_MNK_M) = (uint16_t)M;
+  *REDMULE_REG16(REG_MNK_N) = (uint16_t)N;
+  *REDMULE_REG16(REG_MNK_K) = (uint16_t)O;
+
+  *REDMULE_REG32(REG_X_ADDR) = (uint32_t)((uintptr_t)pX);
+  *REDMULE_REG32(REG_Y_ADDR) = (uint32_t)((uintptr_t)pY);
+  *REDMULE_REG32(REG_Z_ADDR) = (uint32_t)((uintptr_t)pZ);
+  *REDMULE_REG32(REG_W_ADDR) = (uint32_t)((uintptr_t)pW);
+
+  *REDMULE_REG32(REG_COMPUTE_MODE) = REDMULE_MODE_FP32;
+
+  // The job is started by reading the trigger register; the value is unused.
+  (void)*REDMULE_REG32(REG_TRIGGER);
+}
+
+// Reads the wait register and returns its value. The synchronous kernels
+// below use this read as their completion point.
+static inline uint32_t redmule_wait(void) { return *REDMULE_REG32(REG_WAIT); }
+
+// Synchronous matrix multiply pDstY[M, O] = pSrcA[M, N] * pSrcB[N, O].
+// pDstY is zeroed first and passed as both Y and Z of the job.
+void MatMul_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA,
+                                   const float32_t *__restrict__ pSrcB,
+                                   float32_t *__restrict__ pDstY, uint32_t M,
+                                   uint32_t N, uint32_t O) {
+  redmule_zero_fp32(pDstY, M * O);
+  redmule_start_fp32(pSrcA, pSrcB, pDstY, pDstY, M, N, O);
+  (void)redmule_wait();
+}
+
+// Same job setup as MatMul_fp32_fp32_fp32_Redmule, but returns right after
+// the trigger; pair it with MatMul_fp32_fp32_fp32_Redmule_Wait().
+void MatMul_fp32_fp32_fp32_Redmule_Async(const float32_t *__restrict__ pSrcA,
+                                         const float32_t *__restrict__ pSrcB,
+                                         float32_t *__restrict__ pDstY,
+                                         uint32_t M, uint32_t N, uint32_t O) {
+  redmule_zero_fp32(pDstY, M * O);
+  redmule_start_fp32(pSrcA, pSrcB, pDstY, pDstY, M, N, O);
+}
+
+// Reads the wait register for a job started with the _Async variant and
+// returns the value read.
+uint32_t MatMul_fp32_fp32_fp32_Redmule_Wait(void) { return redmule_wait(); }
+
+// Synchronous GEMM with an [M, O] addend: pBias is programmed as Y and pDstY
+// as Z. pDstY is not cleared, and callers may pass the same buffer for pBias
+// and pDstY to accumulate in place.
+void Gemm_fp32_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA,
+                                      const float32_t *__restrict__ pSrcB,
+                                      const float32_t *__restrict__ pBias,
+                                      float32_t *__restrict__ pDstY, uint32_t M,
+                                      uint32_t N, uint32_t O) {
+  redmule_start_fp32(pSrcA, pSrcB, pBias, pDstY, M, N, O);
+  (void)redmule_wait();
+}
diff --git a/TargetLibraries/PULPOpen/src/PWConvGrad_fp32_Redmule.c b/TargetLibraries/PULPOpen/src/PWConvGrad_fp32_Redmule.c
new file mode 100644
index 00000000..e1945e38
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/PWConvGrad_fp32_Redmule.c
@@ -0,0 +1,227 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "DeeployPULPMath.h"
+#include "pmsis.h"
+
+extern void MatMul_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA,
+ const float32_t *__restrict__ pSrcB,
+ float32_t *__restrict__ pDstY,
+ uint32_t M, uint32_t N, uint32_t O);
+extern void Gemm_fp32_fp32_fp32_fp32_Redmule(
+ const float32_t *__restrict__ pSrcA, const float32_t *__restrict__ pSrcB,
+ const float32_t *__restrict__ pBias, float32_t *__restrict__ pDstY,
+ uint32_t M, uint32_t N, uint32_t O);
+
+// Chunk over P = H_out * W_out positions to keep the L1 transient buffer
+// fixed-small regardless of the network's feature-map area. Each chunk
+// runs one RedMulE call; chunk-to-chunk accumulation rides on Gemm's
+// y_addr = bias = previous dW pattern (same trick the MatMul driver uses
+// for its Y=Z=pDstY zero-init).
+#define PWGW_CHUNK_P 16
+
+// Pointwise (1x1) Conv backward weight gradient, RedMulE-accelerated.
+//
+// Forward (1x1, stride (SP, SQ)):
+// Y[F, h_out, w_out] = sum_c X[c, h_out * SP, w_out * SQ] * W[F, c, 0, 0]
+// Backward dW:
+// dW[F, C] = sum_{n, h_out, w_out} dY[F, h_out, w_out]
+// * X[C, h_out * SP, w_out * SQ]
+//
+// Mathematically dW = dY_reshape[F, P] @ X_sampled^T[P, C] with
+// P = H_out * W_out. A full P-row transpose buffer doesn't scale -- early
+// MobileNet blocks would need 32 * 48 * 48 floats (288 KiB) and the
+// pattern-memory solver runs out of L1 budget. Instead, sample+transpose
+// PWGW_CHUNK_P rows at a time and accumulate into dW via Gemm:
+// dW = dY_chunk[F, chunk_size] @ X_chunk^T[chunk_size, C] + dW_prev
+// The buffer size is fixed at PWGW_CHUNK_P * C_in floats regardless of P,
+// at the cost of one extra RedMulE trigger per chunk (~200 cycles each).
+//
+// Stride is recovered from the input/output spatial ratios so the kernel
+// signature stays compatible with the pulp-trainlib variant.
+void PWConvGradW2d_fp32_fp32_fp32_CHW_Redmule(
+ const float32_t *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out,
+ uint32_t C_out, const float32_t *__restrict__ pInput, uint32_t H_in,
+ uint32_t W_in, uint32_t C_in, float32_t *__restrict__ pGradWeight,
+ float32_t *__restrict__ pTransposeBuffer) {
+
+ const int8_t core_id = pi_core_id();
+ const uint32_t SP = (H_out > 0) ? (H_in / H_out) : 1;
+ const uint32_t SQ = (W_out > 0) ? (W_in / W_out) : 1;
+ const uint32_t P = H_out * W_out;
+
+ // Initialise dW to zero so the first chunk's Gemm-with-bias starts from
+ // a clean slate. Done in parallel across cores.
+ const uint32_t dw_total = C_out * C_in;
+ const uint32_t dw_chunk = (dw_total + NUM_CORES - 1) / NUM_CORES;
+ const uint32_t dw_lo = MIN((uint32_t)core_id * dw_chunk, dw_total);
+ const uint32_t dw_hi = MIN(dw_lo + dw_chunk, dw_total);
+ for (uint32_t i = dw_lo; i < dw_hi; ++i) {
+ pGradWeight[i] = 0.0f;
+ }
+ pi_cl_team_barrier(0);
+
+ for (uint32_t chunk_start = 0; chunk_start < P; chunk_start += PWGW_CHUNK_P) {
+ const uint32_t this_chunk =
+ ((P - chunk_start) < PWGW_CHUNK_P) ? (P - chunk_start) : PWGW_CHUNK_P;
+
+ // ---- 1. Parallel sampled-transpose of this chunk's X positions -----
+ // pTransposeBuffer[k_local * C_in + c] = X[c, h_in, w_in]
+ const uint32_t total = this_chunk * C_in;
+ const uint32_t chunk_w = (total + NUM_CORES - 1) / NUM_CORES;
+ const uint32_t lo = MIN((uint32_t)core_id * chunk_w, total);
+ const uint32_t hi = MIN(lo + chunk_w, total);
+
+ for (uint32_t idx = lo; idx < hi; ++idx) {
+ const uint32_t k_local = idx / C_in;
+ const uint32_t c = idx % C_in;
+ const uint32_t k = chunk_start + k_local;
+ const uint32_t h_out = k / W_out;
+ const uint32_t w_out = k % W_out;
+ const uint32_t h_in = h_out * SP;
+ const uint32_t w_in = w_out * SQ;
+ pTransposeBuffer[idx] = pInput[c * (H_in * W_in) + h_in * W_in + w_in];
+ }
+
+ pi_cl_team_barrier(0);
+
+ // ---- 2. RedMulE Gemm: dW = dY_chunk @ X_chunk^T + dW_prev ---------
+ // Gemm is called with pGradWeight as both addend and destination, so
+ // each chunk adds its contribution to the running dW that was zeroed
+ // before the loop.
+ //
+ // dY is indexed below as [C_out, P], so row f of the current chunk
+ // starts at pGradOut + f * P + chunk_start. These per-row slices are
+ // only contiguous as a [C_out, this_chunk] matrix when the chunk spans
+ // all of P.
+ //
+ // Single-chunk case (this_chunk == P, i.e. P <= PWGW_CHUNK_P):
+ // pGradOut is passed to Gemm directly as the left operand.
+ // Multi-chunk case: all cores first gather the chunk's dY columns into
+ // a contiguous scratch view, then core 0 runs Gemm on that view.
+ //
+ // NOTE(review): SP and SQ above are floor(H_in / H_out) and
+ // floor(W_in / W_out); presumably the true stride only when the input
+ // extent is an exact multiple of the output extent -- confirm.
+ if (this_chunk == P) {
+ if (core_id == 0) {
+ Gemm_fp32_fp32_fp32_fp32_Redmule(pGradOut, pTransposeBuffer,
+ pGradWeight, pGradWeight, C_out,
+ this_chunk, C_in);
+ }
+ } else {
+ // Multi-chunk path: gather a contiguous [C_out, this_chunk] view of dY
+ // into pTransposeBuffer past the first PWGW_CHUNK_P * C_in floats.
+ // NOTE(review): needs C_out * this_chunk extra floats -- confirm sizing.
+ float32_t *dY_view = pTransposeBuffer + (PWGW_CHUNK_P * C_in);
+ const uint32_t dy_total = C_out * this_chunk;
+ const uint32_t dy_chunk = (dy_total + NUM_CORES - 1) / NUM_CORES;
+ const uint32_t dy_lo = MIN((uint32_t)core_id * dy_chunk, dy_total);
+ const uint32_t dy_hi = MIN(dy_lo + dy_chunk, dy_total);
+ for (uint32_t idx = dy_lo; idx < dy_hi; ++idx) {
+ const uint32_t f = idx / this_chunk;
+ const uint32_t k_local = idx % this_chunk;
+ const uint32_t k = chunk_start + k_local;
+ dY_view[idx] = pGradOut[f * P + k];
+ }
+ pi_cl_team_barrier(0);
+ if (core_id == 0) {
+ Gemm_fp32_fp32_fp32_fp32_Redmule(dY_view, pTransposeBuffer,
+ pGradWeight, pGradWeight, C_out,
+ this_chunk, C_in);
+ }
+ }
+
+ pi_cl_team_barrier(0);
+ }
+}
+
+// Pointwise (1x1) Conv backward input gradient, RedMulE-accelerated.
+//
+// Same shape relations as the forward path; stride > 1 means dX has more
+// spatial positions than dY and only the strided samples are non-zero.
+//
+// Pipeline:
+// - Zero pGradIn.
+// - W^T transpose: pTransposeBuffer[0:C_in*C_out] = W^T.
+// - GEMM tmp[C_in, P] = W^T @ dY[C_out, P], P = H_out * W_out.
+// For stride 1 we write tmp directly into pGradIn (dX layout matches).
+// For stride > 1 we route the GEMM output to the tail of
+// pTransposeBuffer and scatter it into pGradIn at strided positions.
+//
+// Unlike the W kernel, X's GEMM dimensions don't scale with P alone --
+// the K (inner) dim is C_out, which is bounded by the tile's
+// channel-tile. So the existing all-in-one-GEMM path remains feasible
+// and we keep it; only the transient buffer changed shape (size cap
+// reflected in RedmulePWConvGradXTemplate).
+void PWConvGradX2d_fp32_fp32_fp32_CHW_Redmule(
+ const float32_t *__restrict__ pGradOut, uint32_t H_out, uint32_t W_out,
+ uint32_t C_out, const float32_t *__restrict__ pWeight, uint32_t C_in,
+ float32_t *__restrict__ pGradIn, uint32_t H_in, uint32_t W_in,
+ float32_t *__restrict__ pTransposeBuffer,
+ uint32_t transposeBufferSize) {
+
+ (void)transposeBufferSize; // NOTE(review): never checked against usage
+
+ const int8_t core_id = pi_core_id();
+ const uint32_t SP = (H_out > 0) ? (H_in / H_out) : 1;
+ const uint32_t SQ = (W_out > 0) ? (W_in / W_out) : 1;
+ const uint32_t P = H_out * W_out;
+ const bool strided = (SP != 1) || (SQ != 1);
+
+ // ---- 1. Zero pGradIn (parallel) ---------------------------------------
+ const uint32_t dx_total = C_in * H_in * W_in;
+ const uint32_t dx_chunk = (dx_total + NUM_CORES - 1) / NUM_CORES;
+ const uint32_t dx_lo = MIN((uint32_t)core_id * dx_chunk, dx_total);
+ const uint32_t dx_hi = MIN(dx_lo + dx_chunk, dx_total);
+ for (uint32_t i = dx_lo; i < dx_hi; ++i) {
+ pGradIn[i] = 0.0f;
+ }
+
+ // ---- 2. Parallel transpose W[C_out, C_in] -> W^T[C_in, C_out] --------
+ const uint32_t wt_total = C_in * C_out;
+ const uint32_t wt_chunk = (wt_total + NUM_CORES - 1) / NUM_CORES;
+ const uint32_t wt_lo = MIN((uint32_t)core_id * wt_chunk, wt_total);
+ const uint32_t wt_hi = MIN(wt_lo + wt_chunk, wt_total);
+ for (uint32_t idx = wt_lo; idx < wt_hi; ++idx) {
+ const uint32_t c_in = idx / C_out;
+ const uint32_t c_out = idx % C_out;
+ pTransposeBuffer[idx] = pWeight[c_out * C_in + c_in];
+ }
+
+ pi_cl_team_barrier(0); // zeroing and W^T must be complete before the GEMM
+
+ // ---- 3. RedMulE GEMM: dX_dense[C_in, P] = W^T[C_in, C_out] @ dY[C_out, P] -
+ if (core_id == 0) {
+ if (!strided) {
+ MatMul_fp32_fp32_fp32_Redmule(pTransposeBuffer, pGradOut, pGradIn,
+ C_in, C_out, P); // result lands directly in pGradIn
+ } else {
+ float32_t *tmp_gemm = pTransposeBuffer + (C_in * C_out); // after W^T
+ MatMul_fp32_fp32_fp32_Redmule(pTransposeBuffer, pGradOut, tmp_gemm,
+ C_in, C_out, P); // writes C_in * P floats of scratch
+ }
+ }
+
+ pi_cl_team_barrier(0);
+
+ // ---- 4. Scatter (stride > 1 only) ------------------------------------
+ if (strided) {
+ float32_t *tmp_gemm = pTransposeBuffer + (C_in * C_out);
+ const uint32_t scat_total = C_in * P;
+ const uint32_t scat_chunk = (scat_total + NUM_CORES - 1) / NUM_CORES;
+ const uint32_t scat_lo = MIN((uint32_t)core_id * scat_chunk, scat_total);
+ const uint32_t scat_hi = MIN(scat_lo + scat_chunk, scat_total);
+ for (uint32_t idx = scat_lo; idx < scat_hi; ++idx) {
+ const uint32_t c = idx / P;
+ const uint32_t k = idx % P;
+ const uint32_t h_out = k / W_out;
+ const uint32_t w_out = k % W_out;
+ const uint32_t h_in = h_out * SP;
+ const uint32_t w_in = w_out * SQ;
+ pGradIn[c * (H_in * W_in) + h_in * W_in + w_in] = tmp_gemm[idx];
+ }
+ pi_cl_team_barrier(0);
+ }
+}