From defb9461f3c3a9a1b0c0c0323c10a8f57bee0c0e Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Wed, 22 Oct 2025 10:48:22 +0200 Subject: [PATCH 1/6] Annotate execution blocks with transfer information Instead of calculating the transfer information in the wrapTilingSolution function every time for each memory level, do it once in the TilerExtension and annotate the execution block with it like with the pattern. I'm not fully satisfied with the approach, but it's a step in the right direction. --- Deeploy/DeeployTypes.py | 1 + .../TilingCodeGeneration.py | 10 ++- .../TilingVariableReplacement.py | 19 ++++- Deeploy/TilingExtension/TileConstraint.py | 83 ++----------------- Deeploy/TilingExtension/TilerExtension.py | 30 +++++++ Deeploy/TilingExtension/TilingCodegen.py | 26 +++--- 6 files changed, 72 insertions(+), 97 deletions(-) diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index e6ca25c9bd..a339e3afe1 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -1458,6 +1458,7 @@ def __init__(self, operatorCodeSnippet: Optional[CodeSnippet] = None): ) #: Sequence[CodeSnippet]: ordered list of code snippets that need to be generated to implemented the associated operator self.patternMemoryConstraint: Optional = None #: Optional[PatternMemoryConstraint]: Tiling information of the operator which is annotated in the midend + self.transfers: Optional = None #: Optional[Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]]: Tiling transfers def addLeft(self, template: NodeTemplate, operatorRepresentation: OperatorRepresentation): """Adds a code snippet that is generated BEFORE any of the other code snippets in this ExecutionBlock diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py index 0db3109aea..11ba8f0a40 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py +++ 
b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py @@ -19,6 +19,7 @@ from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import TilingHoistingMixIn from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PrototypeTilingMixIn from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme, \ calculateFlatOffset, minimizeRectangle, minimizeVariableReplacement, padOffset, padShape, stridesFromShape @@ -241,8 +242,13 @@ def apply(self, assert isinstance(buffer, VariableBuffer) unraveledOpRepr[key] = ctxt.unravelReference(buffer).name - variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution( - nodeMemoryConstraint, self.localMemory, ctxt, unraveledOpRepr) + tileConstraint: TileConstraint = template.tileConstraint + transfers = { + tensorName: memTransfers[self.localMemory] + for tensorName, memTransfers in baseExecutionBlock.transfers.items() + } + variableReplacement, tilingSchedules = tileConstraint.wrapTilingSolution(nodeMemoryConstraint, self.localMemory, + ctxt, unraveledOpRepr, transfers) minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py index 76eacd10dd..803f0805d6 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py @@ -15,6 +15,7 @@ _ReferenceBuffer from Deeploy.TilingExtension.CodeTransformationPasses.TilingHoistingMixIn import TilingHoistingMixIn from Deeploy.TilingExtension.MemoryConstraints import 
NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerExtension import Tiler from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, minimizeVariableReplacement @@ -133,8 +134,13 @@ def apply(self, for key, value in operatorRepresentation.items() } - variableReplacement, tilingSchedules = template.tileConstraint.wrapTilingSolution( - nodeMemoryConstraint, self.targetMemLevel, ctxt, unraveledOpRepr) + tileConstr: TileConstraint = template.tileConstraint + transfers = { + tensorName: memTransfers[self.targetMemLevel] + for tensorName, memTransfers in baseExecutionBlock.transfers.items() + } + variableReplacement, tilingSchedules = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, + ctxt, unraveledOpRepr, transfers) minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) operatorRepresentation.update(newOpRepr) @@ -233,8 +239,13 @@ def apply(self, for key, value in operatorRepresentation.items() } - variableReplacement, _ = template.tileConstraint.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, - ctxt, unraveledOpRepr) + tileConstr: TileConstraint = template.tileConstraint + transfers = { + tensorName: memTransfers[self.targetMemLevel] + for tensorName, memTransfers in baseExecutionBlock.transfers.items() + } + variableReplacement, _ = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, ctxt, + unraveledOpRepr, transfers) minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) operatorRepresentation.update(newOpRepr) diff --git a/Deeploy/TilingExtension/TileConstraint.py b/Deeploy/TilingExtension/TileConstraint.py index 5b067b2ce9..09c0d6e8a7 100644 --- a/Deeploy/TilingExtension/TileConstraint.py +++ b/Deeploy/TilingExtension/TileConstraint.py @@ -2,18 +2,15 @@ # # SPDX-License-Identifier: 
Apache-2.0 -import copy from abc import abstractmethod from typing import Dict, List, Optional, Tuple, Union -import numpy as np from ortools.constraint_solver.pywrapcp import IntVar from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation -from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint, NodeMemoryConstraint, TensorMemoryConstraint +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TilerModel import TilerModel -from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, MemoryTransfer, \ - TilingSchedule, VariableReplacementScheme, computeTileHyperRectangles +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme class TileConstraint(): @@ -91,81 +88,17 @@ def sanitizeTilingSchedule(tilingSchedule: TilingSchedule) -> TilingSchedule: @classmethod def wrapTilingSolution( - cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, - operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: - - def getMemoryTransfer(tensorConstraint: TensorMemoryConstraint, sourceCube: HyperRectangle, - sourceMemoryLevel: str, targetMemoryLevel: str) -> MemoryTransfer: - - size = np.prod(sourceCube.dims) - sourceConstraint = MemoryConstraint(sourceMemoryLevel, size) - sourceConstraint.shape = sourceCube.dims - - destConstraint = copy.copy(tensorConstraint.memoryConstraints[targetMemoryLevel]) - - if any(dim1 > dim2 for dim1, dim2 in zip(destConstraint.shape, sourceConstraint.shape)): - destConstraint.shape = sourceConstraint.shape - - return MemoryTransfer(sourceConstraint, destConstraint) - - def _offsetAdd(offsetA: Tuple[int, ...], offsetB: Tuple[int, ...]) -> Tuple[int, ...]: - return tuple(dimA + dimB for dimA, dimB in zip(offsetA, offsetB)) - - def getCubeTransfers(tensorConstraint: TensorMemoryConstraint, sourceCubes: 
List[AbsoluteHyperRectangle], - sourceMemoryLevel: str, - targetMemoryLevel: str) -> Tuple[List[AbsoluteHyperRectangle], List[int]]: - solution = [] - solutionLengths = [] - - for sourceCube in sourceCubes: - memTransfer = getMemoryTransfer(tensorConstraint, sourceCube.rectangle, sourceMemoryLevel, - targetMemoryLevel) - solutionCubes = computeTileHyperRectangles(memTransfer) - solutionAbsoluteCubes = [ - AbsoluteHyperRectangle(rectangle = cube, - absoluteOffset = _offsetAdd(sourceCube.absoluteOffset, cube.offset)) - for cube in solutionCubes - ] - solution += solutionAbsoluteCubes - solutionLengths.append(len(solutionAbsoluteCubes)) - - return solution, solutionLengths - + cls, tilingSolution: NodeMemoryConstraint, targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation, + transfers: Dict[str, + List[List[AbsoluteHyperRectangle]]]) -> Tuple[VariableReplacementScheme, List[TilingSchedule]]: assert len(tilingSolution.outputTensorMemoryConstraints) == 1, "Expected node to have only one output!" 
- - outVar, outTensorConstraint = next(iter(tilingSolution.outputTensorMemoryConstraints.items())) - memoryPath = list(outTensorConstraint.memoryConstraints.keys()) - - assert targetMemLevel in memoryPath, \ - f"Target memory level {targetMemLevel} does not exist in the memory path {memoryPath}" - - targetIdx = memoryPath.index(targetMemLevel) - - if targetIdx == 0: - # SCHEREMO: Watch out - this happens if inputs are in L(N+1) but outputs only in L(N) - targetIdx = 1 - - fullShape = ctxt.lookup(outVar).shape - initialOffset = (0,) * len(fullShape) - outputCubes = [ - AbsoluteHyperRectangle(rectangle = HyperRectangle(offset = initialOffset, dims = tuple(fullShape)), - absoluteOffset = initialOffset) - ] - - for source, target in zip(memoryPath[:targetIdx], memoryPath[1:targetIdx + 1]): - outputCubes, solutionLengths = getCubeTransfers(outTensorConstraint, outputCubes, source, target) - - arrayOfCubes = [] - _idx = 0 - for idxLen in solutionLengths: - arrayOfCubes += [outputCubes[_idx:_idx + idxLen]] - _idx += idxLen + outVar, _ = next(iter(tilingSolution.outputTensorMemoryConstraints.items())) varReplacements = [] tilingSchedules = [] - for _outputCubes in arrayOfCubes: - + for _outputCubes in transfers[outVar]: varReplacement, tilingSchedule = cls.serializeTilingSolution(tilingSolution, _outputCubes, targetMemLevel, ctxt, operatorRepresentation) sanitizedTilingSchedule = cls.sanitizeTilingSchedule(tilingSchedule) diff --git a/Deeploy/TilingExtension/TilerExtension.py b/Deeploy/TilingExtension/TilerExtension.py index bdae0fbdcf..87884837fc 100644 --- a/Deeploy/TilingExtension/TilerExtension.py +++ b/Deeploy/TilingExtension/TilerExtension.py @@ -36,6 +36,7 @@ from Deeploy.TilingExtension.MemoryScheduler import MemoryBlock, MemoryScheduler from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, 
computeTileHyperRectangles TilingSolution = List[PatternMemoryConstraints] MemoryMap = Dict[str, List[List[MemoryBlock]]] @@ -940,6 +941,34 @@ def testMemoryMapCorrectness(self, memoryMap: Dict[str, List[List[MemoryBlock]]] assert stepIdx in range(lifetime[0], lifetime[-1] + 1), f"Invalid memory map! Buffer {tensor.name} is not alive at step {stepIdx}!" + def getTransfers(self, tensorMc: TensorMemoryConstraint) -> Dict[str, List[List[AbsoluteHyperRectangle]]]: + transfers: Dict[str, List[List[AbsoluteHyperRectangle]]] = {} + mcs = list(tensorMc.memoryConstraints.items()) + for (externalMemory, externalMc), (localMemory, localMc) in zip(mcs[:-1], mcs[1:]): + # TODO: Should we also use externalMemory as a key in the transfers? + if externalMemory not in transfers: + assert externalMc.shape is not None + shape = externalMc.shape + zeroOffset = (0,) * len(shape) + externalAbsoluteRectangles = [AbsoluteHyperRectangle(HyperRectangle(zeroOffset, shape), zeroOffset)] + else: + # Flatten + externalAbsoluteRectangles = [rect for _list in transfers[externalMemory] for rect in _list] + + transfers[localMemory] = [[ + AbsoluteHyperRectangle(rect, tuple(a + b + for a, b in zip(extAbsRect.absoluteOffset, rect.offset))) + for rect in computeTileHyperRectangles(extAbsRect.rectangle.dims, localMc.shape) + ] + for extAbsRect in externalAbsoluteRectangles] + return transfers + + def getIoTransfers(self, + patternMc: PatternMemoryConstraints) -> Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]]: + assert len(patternMc.nodeConstraints) == 1, "Only layerwise supported for now!" 
+ tMcs = patternMc.nodeConstraints[0].tensorMemoryConstraints + return {name: self.getTransfers(mc) for name, mc in tMcs.items()} + class TilerDeployerWrapper(NetworkDeployerWrapper): @@ -996,6 +1025,7 @@ def tile(self, tilingSolution: Optional[TilingSolution] = None, memoryMap: Optio # SCHEREMO: Annotate execution block with solution for layer, pattern in zip(self.layerBinding.values(), tilingSolution): layer.mapper.binder.executionBlock.patternMemoryConstraint = pattern + layer.mapper.binder.executionBlock.transfers = self.tiler.getIoTransfers(pattern) # SCHEREMO: Code generation STUB diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py index 604ba23c9d..09e5af1346 100644 --- a/Deeploy/TilingExtension/TilingCodegen.py +++ b/Deeploy/TilingExtension/TilingCodegen.py @@ -242,18 +242,12 @@ def calculateFlatOffsetInBytes(tile: HyperRectangle, referenceBuffer: VariableBu (referenceBuffer._type.referencedType.typeWidth // 8)) -def computeTileHyperRectangles(memoryTransfer: MemoryTransfer) -> List[HyperRectangle]: - assert memoryTransfer.source.shape is not None, "Source transfer shape cannot be undefined!" - assert memoryTransfer.destination.shape is not None, "Destination transfer shape cannot be undefined!" +def computeTileHyperRectangles(externalShape: Tuple[int, ...], localShape: Tuple[int, ...]) -> List[HyperRectangle]: + assert len(externalShape) == len(localShape), \ + f"External and local memory shapes don't have the same number of dimensions! External {externalShape} vs. Local {localShape}" - assert len(memoryTransfer.source.shape) == len(memoryTransfer.destination.shape), \ - f"Source and target of memory transfer {memoryTransfer} don't have the same number of dimensions!" 
- - largeShape = memoryTransfer.source.shape - smallShape = memoryTransfer.destination.shape - - for dimIdx, (dimSizeSmall, dimSizeLarge) in enumerate(zip(smallShape, largeShape)): - assert dimSizeSmall <= dimSizeLarge, f"smallShape[{dimIdx}] should not be bigger then largeShape[{dimIdx}]. ({dimSizeSmall} > {dimSizeLarge})" + # LMACAN: The local shape dimensions are of the local buffer so if the external tile is smaller, that's fine + localShape = tuple(min(ext, loc) for ext, loc in zip(externalShape, localShape)) def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]: tileCount = np.prod(tileIndexEnd) @@ -270,18 +264,18 @@ def nextTileIndex(tileIndexEnd: List[int]) -> Generator[List[int]]: tileHyperRectangles = [] tileIndexEnd = [ - int(np.ceil(dimSizeLarge / dimSizeSmall)) for dimSizeLarge, dimSizeSmall in zip(largeShape, smallShape) + int(np.ceil(dimSizeLarge / dimSizeSmall)) for dimSizeLarge, dimSizeSmall in zip(externalShape, localShape) ] for tileIndex in nextTileIndex(tileIndexEnd): - tileOffset = tuple(dimIdx * dimSizeSmall for dimIdx, dimSizeSmall in zip(tileIndex, smallShape)) - for dimIdx, (dimOffset, dimSizeLarge) in enumerate(zip(tileOffset, largeShape)): + tileOffset = tuple(dimIdx * dimSizeSmall for dimIdx, dimSizeSmall in zip(tileIndex, localShape)) + for dimIdx, (dimOffset, dimSizeLarge) in enumerate(zip(tileOffset, externalShape)): assert dimOffset >= 0, f"tileOffset[{dimIdx}] shoud not be smaller then zero ({dimOffset} < 0)" assert dimOffset < dimSizeLarge, f"tileOffset[{dimIdx}] should not be bigger or equal then largeShape[{dimIdx}] ({dimOffset} >= {dimSizeLarge})" tileSize = tuple( min(dimSizeSmall, dimSizeLarge - dimOffset) - for dimSizeSmall, dimSizeLarge, dimOffset in zip(smallShape, largeShape, tileOffset)) - for dimIdx, (dimSize, dimSizeSmall) in enumerate(zip(tileSize, smallShape)): + for dimSizeSmall, dimSizeLarge, dimOffset in zip(localShape, externalShape, tileOffset)) + for dimIdx, (dimSize, dimSizeSmall) in 
enumerate(zip(tileSize, localShape)): assert dimSize > 0, f"tileOffset[{dimIdx}] shoud not be smaller or equal then zero ({dimSize} <= 0)" assert dimSize <= dimSizeSmall, f"tileSize[{dimIdx}] should not be bigger then smallShape[{dimIdx}] ({dimSize} > {dimSizeSmall})" From 84ba64719542620b509d7f8f41fcbe01e841a822 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Wed, 22 Oct 2025 12:01:36 +0200 Subject: [PATCH 2/6] Add cast external pointer to uint32_t for l3 dma --- Deeploy/Targets/PULPOpen/DMA/L3Dma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py index 849db08576..b7b8787f49 100644 --- a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py +++ b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py @@ -22,7 +22,7 @@ class L3Dma(AsyncDma): _transferTemplates = { 2: NodeTemplate( - "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" + "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t)${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" ) } _waitingStrategy = PerTensorWaitingStrategy(L3DmaFuture) From 0096442f6d4733735c7e96b2ece68b84b86001bc Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Wed, 22 Oct 2025 18:42:13 +0200 Subject: [PATCH 3/6] Remove unused MemoryTransfer --- Deeploy/TilingExtension/TilingCodegen.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/Deeploy/TilingExtension/TilingCodegen.py b/Deeploy/TilingExtension/TilingCodegen.py index 09e5af1346..40dc975ee6 100644 --- a/Deeploy/TilingExtension/TilingCodegen.py +++ b/Deeploy/TilingExtension/TilingCodegen.py @@ -11,13 +11,6 @@ from Deeploy.AbstractDataTypes import Pointer from Deeploy.DeeployTypes import OperatorRepresentation, VariableBuffer -from Deeploy.TilingExtension.MemoryConstraints import MemoryConstraint - - -@dataclass -class MemoryTransfer(): - source: MemoryConstraint - destination: MemoryConstraint @dataclass From 
a1973ae03b83286d258338f5d08624a61a19b600 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Wed, 22 Oct 2025 19:05:05 +0200 Subject: [PATCH 4/6] Update changelog --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5421cdf526..0d5ad1fffe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid ## Unreleased (Planned Release Target: v0.2.1) ### List of Pull Requests +- Add tile transfer annotation [#127](https://github.com/pulp-platform/Deeploy/pull/127) - Refactor Logging for Improved Debugging [#115](https://github.com/pulp-platform/Deeploy/pull/115) - Add reuse-tool as an SPDX license header linter [#113](https://github.com/pulp-platform/Deeploy/pull/113) - Bug fixes, API Cleanup and Reduce Compiler Warning on PULP [#112](https://github.com/pulp-platform/Deeploy/pull/112) @@ -46,6 +47,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Buffer utilities: `checkNumLevels` validation and `sizeInBytes` method - Per–memory-level usage tracking and worst-case reporting in `NetworkContext` - Memory/I/O summaries and input/output logging in deployers +- Added transfer annotation of tiled execution blocks ### Changed - Replaced platform-specific tags (`*-amd64`, `*-arm64`) with direct digest references in `Noelware/docker-manifest-action`. @@ -73,6 +75,8 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Changed types and added correct casts to fix many compiler warnings in the PULP target library - Use [reuse-tool](https://github.com/fsfe/reuse-tool) in pre-commit, CI, and Makefile for SPDX license header linting - Deployer workflow now uses `prepare(...)` instead of `generateFunction(...)`. 
+- Refactored computeTileHyperRectangles +- wrapTilingSolution now uses the transfer annotation ### Fixed - Prevent node duplication for graphs generated via GraphSurgeon @@ -83,6 +87,7 @@ This file contains the changelog for the Deeploy project. The changelog is divid - Corrected method usage in `importDeeployState` to call `NetworkContext.importNetworkContext` instead of the incorrect method name - Correctly return `signProp` from `setupDeployer` instead of hardcoding the value to `False` in `testMVP.py` - Fixed `Unsqueeze` Op. when using ONNX opset 13 or higher (from attribute to input) +- Fixed compiler warning by casting the external pointer in L3Dma to uint32_t ### Removed - Delete outdated and unused `.gitlab-ci.yml` file From 45554e2e7197bcca3cb1489aa2fad69df78ac4f9 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 23 Oct 2025 00:59:38 +0200 Subject: [PATCH 5/6] Skip tiling codegen and var replacement if the target memory doesn't have transfers --- .../TilingCodeGeneration.py | 24 +++++++++------ .../TilingVariableReplacement.py | 30 ++++++++++++------- 2 files changed, 35 insertions(+), 19 deletions(-) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py index 11ba8f0a40..1914083f94 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py @@ -5,7 +5,7 @@ import copy import math from abc import abstractmethod -from typing import List, Optional, Tuple, TypeVar +from typing import Dict, List, Optional, Tuple, TypeVar import numpy as np @@ -20,8 +20,9 @@ from Deeploy.TilingExtension.CodeTransformationPasses.TilingPrototypes import PrototypeTilingMixIn from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint, TensorMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint -from 
Deeploy.TilingExtension.TilingCodegen import HyperRectangle, TilingSchedule, VariableReplacementScheme, \ - calculateFlatOffset, minimizeRectangle, minimizeVariableReplacement, padOffset, padShape, stridesFromShape +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme, calculateFlatOffset, minimizeRectangle, minimizeVariableReplacement, padOffset, \ + padShape, stridesFromShape T = TypeVar('T') @@ -242,13 +243,18 @@ def apply(self, assert isinstance(buffer, VariableBuffer) unraveledOpRepr[key] = ctxt.unravelReference(buffer).name - tileConstraint: TileConstraint = template.tileConstraint - transfers = { - tensorName: memTransfers[self.localMemory] - for tensorName, memTransfers in baseExecutionBlock.transfers.items() + tileConstr: TileConstraint = template.tileConstraint + transfers: Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]] = baseExecutionBlock.transfers + targetMemoryTransfers = { + tensorName: memTransfers.get(self.localMemory, None) for tensorName, memTransfers in transfers.items() } - variableReplacement, tilingSchedules = tileConstraint.wrapTilingSolution(nodeMemoryConstraint, self.localMemory, - ctxt, unraveledOpRepr, transfers) + + if any(v is None for v in targetMemoryTransfers.values()): + return ctxt, executionBlock + + variableReplacement, tilingSchedules = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.localMemory, + ctxt, unraveledOpRepr, + targetMemoryTransfers) minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py index 803f0805d6..cbc0ce57cc 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingVariableReplacement.py @@ 
-4,7 +4,7 @@ import copy import itertools -from typing import List, Tuple +from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import Struct from Deeploy.CommonExtensions.CodeTransformationPasses.Closure import ClosureExecutionBlock @@ -17,7 +17,8 @@ from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerExtension import Tiler -from Deeploy.TilingExtension.TilingCodegen import TilingSchedule, VariableReplacementScheme, minimizeVariableReplacement +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme, \ + minimizeVariableReplacement class TilingVariableReplacement(CodeTransformationPass, IntrospectiveCodeTransformationMixIn, TilingHoistingMixIn): @@ -135,12 +136,17 @@ def apply(self, } tileConstr: TileConstraint = template.tileConstraint - transfers = { - tensorName: memTransfers[self.targetMemLevel] - for tensorName, memTransfers in baseExecutionBlock.transfers.items() + transfers: Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]] = baseExecutionBlock.transfers + targetMemoryTransfers = { + tensorName: memTransfers.get(self.targetMemLevel, None) for tensorName, memTransfers in transfers.items() } + + if any(v is None for v in targetMemoryTransfers.values()): + return ctxt, executionBlock + variableReplacement, tilingSchedules = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, - ctxt, unraveledOpRepr, transfers) + ctxt, unraveledOpRepr, + targetMemoryTransfers) minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) operatorRepresentation.update(newOpRepr) @@ -240,12 +246,16 @@ def apply(self, } tileConstr: TileConstraint = template.tileConstraint - transfers = { - tensorName: memTransfers[self.targetMemLevel] - for tensorName, memTransfers in baseExecutionBlock.transfers.items() + 
transfers: Dict[str, Dict[str, List[List[AbsoluteHyperRectangle]]]] = baseExecutionBlock.transfers + targetMemoryTransfers = { + tensorName: memTransfers.get(self.targetMemLevel, None) for tensorName, memTransfers in transfers.items() } + + if any(v is None for v in targetMemoryTransfers.values()): + return ctxt, executionBlock + variableReplacement, _ = tileConstr.wrapTilingSolution(nodeMemoryConstraint, self.targetMemLevel, ctxt, - unraveledOpRepr, transfers) + unraveledOpRepr, targetMemoryTransfers) minimalVariableReplacement, newOpRepr = minimizeVariableReplacement(variableReplacement, operatorRepresentation) operatorRepresentation.update(newOpRepr) From cf6736295d606176d2e3fb92b34923464bc38ad2 Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Thu, 23 Oct 2025 13:51:22 +0200 Subject: [PATCH 6/6] Initialize TransientBuffers on allocation instead of on top of network execution code --- .../CodeTransformationPasses/MemoryAllocation.py | 2 ++ Deeploy/DeeployTypes.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py index b73fcafe31..e392e3355b 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py @@ -130,6 +130,8 @@ def apply(self, ctxt._dynamicSize[memoryLevel] += int(buffer.sizeInBytes()) executionBlock.addLeft(buffer.allocTemplate, buffer._bufferRepresentation()) + if isinstance(buffer, TransientBuffer): + executionBlock.addLeft(buffer.initTemplate, buffer._bufferRepresentation()) for levels in ctxt._dynamicSize.keys(): if levels not in ctxt._maxDynamicSize: diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index a339e3afe1..3783503931 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -2893,7 +2893,7 @@ def generateInferenceInitializationCode(self) -> str: callStack = '' 
for node in self.ctxt.localObjects.values(): # WIESEP: We don't want to initialize the struct buffers as this should be handled by the ArgumentStructGeneration - if isinstance(node, StructBuffer): + if isinstance(node, (StructBuffer, TransientBuffer)): continue name = node.name