Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Deeploy/Targets/Generic/Layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ def __init__(self, maps: List[NodeMapper]):
def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
                  channels_first) -> Tuple[Shape, Shape]:
    """Return the input/output shapes, fixing up the third input if present.

    When exactly three input shapes are supplied, the third entry is
    overwritten in place with a 1-tuple holding the leading dimension of
    the second input shape. With any other number of inputs the shapes are
    passed through unchanged.

    Args:
        inputShapes: Mutable sequence of input shapes; modified in place
            when it has three entries.
        outputShapes: Output shapes, returned untouched.
        operatorRepresentation: Unused by this implementation.
        channels_first: Unused by this implementation.

    Returns:
        The ``(inputShapes, outputShapes)`` pair.
    """
    if len(inputShapes) == 3:
        # Store the shape as a tuple, not a bare int, so the entry has the
        # same form as a shape. (Presumably input 2 is a per-output-channel
        # bias of the weight tensor at input 1 -- TODO confirm.)
        inputShapes[2] = (inputShapes[1][0],)
    return (inputShapes, outputShapes)

def computeOps(self):
Expand Down
2 changes: 1 addition & 1 deletion Deeploy/Targets/PULPOpen/Bindings.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@

PULPFloatConvGradX2DBindings = [
NodeBinding(ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
FloatConvGradTemplate.referenceConvGradX2DIm2ColTiledTemplate, ForkTransformer)
FloatConvGradTemplate.referenceConvGradX2DTemplate, ForkTransformer)
]

PULPFloatDWConv2DBindings = [
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
60 changes: 57 additions & 3 deletions DeeployTest/testUtils/codeGenerateTraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ def generateTrainingTestInputsHeader(deployer: NetworkDeployer,
paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth
list_str += ", " + ", ".join("0" for _ in range(paddingElements))

retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n"
retStr += f'__attribute__((section(".weightmem_sram"))) {typeName} {buf_name}[] = {{{list_str}}};\n'

# Emit the row pointer array for this mini-batch
row_name = f"testDataRow{mb}"
Expand Down Expand Up @@ -503,7 +503,10 @@ def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict
return shared_input_map, shared_output_map


def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int]) -> str:
def _patch_shared_buffers(retStr: str,
shared_input_map: Dict[int, int],
shared_output_map: Dict[int, int],
train_c_source: str = "") -> str:
"""Redirect optimizer I/O buffers to Training's already-allocated buffers.

Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution so that
Expand Down Expand Up @@ -558,12 +561,25 @@ def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_
_arena_pat = re.compile(r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)'
r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;')

def _is_train_l2(train_idx: int) -> bool:
    """Report whether training ``input_<train_idx>`` is allocated via pi_l2_malloc.

    The check is a textual search of ``train_c_source`` for an assignment
    of the form ``<train prefix>input_N = (<cast>) pi_l2_malloc``. Such a
    buffer has been promoted to L2; handing its pointer to the optimizer
    would feed an L2 address into the optimizer's closure_L3
    pi_cl_ram_copy_2d and end in a HyperRAM out-of-bounds access.

    Returns False when no training C source was supplied.
    """
    if train_c_source:
        l2_alloc = rf'{_TRAIN_PREFIX}input_{train_idx}\s*=\s*\([^)]+\)\s*pi_l2_malloc\b'
        return re.search(l2_alloc, train_c_source) is not None
    return False

def _make_replacement(symbol: str, kind: str, idx: int) -> Optional[str]:
    """Build the C statement that aliases an optimizer buffer to a training buffer.

    Args:
        symbol: Name of the optimizer buffer being assigned.
        kind: ``"input"`` or ``"output"``; selects which shared map is consulted.
        idx: Optimizer buffer index looked up in that map.

    Returns:
        The replacement assignment, or ``None`` when the buffer is not
        shared or when the matching training buffer lives in L2 (the
        optimizer expects L3, so the pointer must not be shared).
    """
    if kind == "input":
        if idx not in shared_input_map:
            return None
        train_idx = shared_input_map[idx]
        if _is_train_l2(train_idx):
            # Training buffer is at L2 while the optimizer expects L3.
            return None
        return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* shared with TrainingNetwork */'

    if kind == "output":
        if idx not in shared_output_map:
            return None
        train_idx = shared_output_map[idx]
        if _is_train_l2(train_idx):
            return None
        # Outputs alias a training *input* buffer: the update is in place.
        return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* in-place, shared with TrainingNetwork */'

    return None

Expand All @@ -574,6 +590,41 @@ def _replace(m: re.Match) -> str:
retStr = _malloc_pat.sub(_replace, retStr)
retStr = _arena_pat.sub(_replace, retStr)

# ------------------------------------------------------------------
# Drop load_file_to_ram() for shared I/O buffers.
#
# InitOptimizerNetwork() emits one line per input:
#
# load_file_to_ram(DeeployOptNetwork_input_N, "N.hex");
#
# which expands to cl_ram_write(addr, ...). cl_ram_write expects
# `addr` to be a hyperram (L3) offset; the underlying DMA engine
# masks it to the hyperram address range. For a shared input that
# has been redirected (above) to a TrainingNetwork buffer, the
# destination address is whatever level that buffer lives in -- and
# once PromoteTensorsToL2 starts hoisting training inputs to L2,
# that pointer is an L2 address. Stripping it to a hyperram offset
# yields nonsense (e.g. 0x10800000 -> 0x800000) which GVSoC reports
# as `/ram out-of-bound request (addr 0x800000, ram_size 0x800000)`
# and the simulation aborts.
#
# These loads are also dead code: the test harness re-initialises
# every shared input via l3_aware_copy(testInitWeights[]) after both
# InitTrainingNetwork() and InitOptimizerNetwork() return, and that
# helper picks the right L2/L3 writer per buffer.
_load_pat = re.compile(r'[^\n]*load_file_to_ram\s*\(\s*DeeployOptNetwork_(input|output)_(\d+)\s*,[^;]+\);\s*\n')

def _maybe_drop_load(m: re.Match) -> str:
    """Regex substitution callback: delete load_file_to_ram lines of shared buffers.

    Group 1 of the match is the buffer kind (``input``/``output``) and
    group 2 its index. A line whose index appears in the corresponding
    shared map is replaced by the empty string; any other line is
    returned unchanged.
    """
    # NOTE(review): the load is dropped whenever the index is in the shared
    # map, even if _make_replacement declined the redirect because the
    # training buffer sits in L2 -- confirm that is intended.
    shared_by_kind = {"input": shared_input_map, "output": shared_output_map}
    buf_kind, buf_idx = m.group(1), int(m.group(2))
    if buf_idx in shared_by_kind.get(buf_kind, ()):
        return ''
    return m.group(0)

retStr = _load_pat.sub(_maybe_drop_load, retStr)

# ------------------------------------------------------------------
# Arena elimination: if a MEMORYARENA_Lx is no longer used for any
# pointer arithmetic after the redirects, its malloc is dead and can
Expand Down Expand Up @@ -852,7 +903,10 @@ def generateOptimizerNetworkImplementation(deployer: NetworkDeployer,
# Prefix substitution
retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX)
# Replace malloc calls for shared weight/grad buffers with Training pointers
retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {})
retStr = _patch_shared_buffers(retStr,
shared_input_map or {},
shared_output_map or {},
train_c_source = train_c_source or "")
# Redirect optimizer L1/L2 arena mallocs to reuse training arenas
if train_c_source:
retStr = _patch_shared_arenas(retStr, train_c_source)
Expand Down
1 change: 1 addition & 0 deletions DeeployTest/test_siracusa_tiled_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@
"Models/Training/SimpleMLP/simplemlp_train": [64000],
"Models/Training/Autoencoder/autoencoder_train": [128000],
"Models/Training/DSCNN/dscnn_train": [128000, 64000],
"Models/Training/SpeechNet/speechnet_train": [128000],
}

# Training-enabled tiled models that need L3 spill (weights/activations don't
Expand Down
Loading
Loading