diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index 9ebad6c4..d0ef93ba 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -358,7 +358,7 @@ def __init__(self, maps: List[NodeMapper]): def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, channels_first) -> Tuple[Shape, Shape]: if len(inputShapes) == 3: - inputShapes[2] = inputShapes[1][0] + inputShapes[2] = (inputShapes[1][0],) return (inputShapes, outputShapes) def computeOps(self): diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index bd0e2976..f73be997 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -252,7 +252,7 @@ PULPFloatConvGradX2DBindings = [ NodeBinding(ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), - FloatConvGradTemplate.referenceConvGradX2DIm2ColTiledTemplate, ForkTransformer) + FloatConvGradTemplate.referenceConvGradX2DTemplate, ForkTransformer) ] PULPFloatDWConv2DBindings = [ diff --git a/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_optimizer/network.onnx b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_optimizer/network.onnx new file mode 100644 index 00000000..a3ad349c Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_optimizer/network.onnx differ diff --git a/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/inputs.npz b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/inputs.npz new file mode 100644 index 00000000..29393bb0 Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/network.onnx b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/network.onnx new file mode 100644 index 00000000..5b4150e1 Binary files /dev/null and 
b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/network.onnx differ diff --git a/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/outputs.npz b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/outputs.npz new file mode 100644 index 00000000..c6faf181 Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/outputs.npz differ diff --git a/DeeployTest/testUtils/codeGenerateTraining.py b/DeeployTest/testUtils/codeGenerateTraining.py index 25eb23b0..418fa41a 100644 --- a/DeeployTest/testUtils/codeGenerateTraining.py +++ b/DeeployTest/testUtils/codeGenerateTraining.py @@ -127,7 +127,7 @@ def generateTrainingTestInputsHeader(deployer: NetworkDeployer, paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth list_str += ", " + ", ".join("0" for _ in range(paddingElements)) - retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n" + retStr += f'__attribute__((section(".weightmem_sram"))) {typeName} {buf_name}[] = {{{list_str}}};\n' # Emit the row pointer array for this mini-batch row_name = f"testDataRow{mb}" @@ -503,7 +503,10 @@ def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict return shared_input_map, shared_output_map -def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int]) -> str: +def _patch_shared_buffers(retStr: str, + shared_input_map: Dict[int, int], + shared_output_map: Dict[int, int], + train_c_source: str = "") -> str: """Redirect optimizer I/O buffers to Training's already-allocated buffers. 
Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution so that @@ -558,12 +561,25 @@ def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_ _arena_pat = re.compile(r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)' r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;') + def _is_train_l2(train_idx: int) -> bool: + """Check if training input_N was allocated with pi_l2_malloc (promoted). + If so, sharing the pointer would send an L2 address to the optimizer's + closure_L3 pi_cl_ram_copy_2d → HyperRAM OOB.""" + if not train_c_source: + return False + pat = rf'{_TRAIN_PREFIX}input_{train_idx}\s*=\s*\([^)]+\)\s*pi_l2_malloc\b' + return bool(re.search(pat, train_c_source)) + def _make_replacement(symbol: str, kind: str, idx: int) -> Optional[str]: if kind == "input" and idx in shared_input_map: train_idx = shared_input_map[idx] + if _is_train_l2(train_idx): + return None # Don't share: training buffer at L2, optimizer expects L3 return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* shared with TrainingNetwork */' if kind == "output" and idx in shared_output_map: train_idx = shared_output_map[idx] + if _is_train_l2(train_idx): + return None return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx}; /* in-place, shared with TrainingNetwork */' return None @@ -574,6 +590,41 @@ def _replace(m: re.Match) -> str: retStr = _malloc_pat.sub(_replace, retStr) retStr = _arena_pat.sub(_replace, retStr) + # ------------------------------------------------------------------ + # Drop load_file_to_ram() for shared I/O buffers. + # + # InitOptimizerNetwork() emits one line per input: + # + # load_file_to_ram(DeeployOptNetwork_input_N, "N.hex"); + # + # which expands to cl_ram_write(addr, ...). cl_ram_write expects + # `addr` to be a hyperram (L3) offset; the underlying DMA engine + # masks it to the hyperram address range. 
For a shared input that + # has been redirected (above) to a TrainingNetwork buffer, the + # destination address is whatever level that buffer lives in -- and + # once PromoteTensorsToL2 starts hoisting training inputs to L2, + # that pointer is an L2 address. Stripping it to a hyperram offset + # yields nonsense (e.g. 0x10800000 -> 0x800000) which GVSoC reports + # as `/ram out-of-bound request (addr 0x800000, ram_size 0x800000)` + # and the simulation aborts. + # + # These loads are also dead code: the test harness re-initialises + # every shared input via l3_aware_copy(testInitWeights[]) after both + # InitTrainingNetwork() and InitOptimizerNetwork() return, and that + # helper picks the right L2/L3 writer per buffer. + _load_pat = re.compile(r'[^\n]*load_file_to_ram\s*\(\s*DeeployOptNetwork_(input|output)_(\d+)\s*,[^;]+\);\s*\n') + + def _maybe_drop_load(m: re.Match) -> str: + kind = m.group(1) + idx = int(m.group(2)) + if kind == "input" and idx in shared_input_map: + return '' + if kind == "output" and idx in shared_output_map: + return '' + return m.group(0) + + retStr = _load_pat.sub(_maybe_drop_load, retStr) + # ------------------------------------------------------------------ # Arena elimination: if a MEMORYARENA_Lx is no longer used for any # pointer arithmetic after the redirects, its malloc is dead and can @@ -852,7 +903,10 @@ def generateOptimizerNetworkImplementation(deployer: NetworkDeployer, # Prefix substitution retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX) # Replace malloc calls for shared weight/grad buffers with Training pointers - retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {}) + retStr = _patch_shared_buffers(retStr, + shared_input_map or {}, + shared_output_map or {}, + train_c_source = train_c_source or "") # Redirect optimizer L1/L2 arena mallocs to reuse training arenas if train_c_source: retStr = _patch_shared_arenas(retStr, train_c_source) diff --git 
a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index bafa6635..90eaf13c 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -161,6 +161,7 @@ "Models/Training/SimpleMLP/simplemlp_train": [64000], "Models/Training/Autoencoder/autoencoder_train": [128000], "Models/Training/DSCNN/dscnn_train": [128000, 64000], + "Models/Training/SpeechNet/speechnet_train": [128000], } # Training-enabled tiled models that need L3 spill (weights/activations don't diff --git a/docs/tutorial_speechnet_training.ipynb b/docs/tutorial_speechnet_training.ipynb new file mode 100644 index 00000000..266010b0 --- /dev/null +++ b/docs/tutorial_speechnet_training.ipynb @@ -0,0 +1,380 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# SpeechNet On-Device Training Tutorial\n", + "\n", + "This tutorial walks through the complete pipeline for deploying **SpeechNet** (a lightweight CNN for EMG-based silent speech recognition) on the **Siracusa RISC-V MCU** using **Deeploy**.\n", + "\n", + "You will learn:\n", + "1. How to define a Deeploy-friendly PyTorch model\n", + "2. How to export inference and training ONNX graphs using Onnx4Deeploy\n", + "3. How to run untiled and tiled Deeploy deployment on Siracusa (GVSoC)\n", + "4. Key design decisions and pitfalls\n", + "\n", + "**Prerequisites**: Familiarity with PyTorch, ONNX, and basic knowledge of RISC-V MCU architectures.\n", + "\n", + "**Reference**: Spacone et al., \"SilentWear: an Ultra-Low Power Wearable System for EMG-based Silent Speech Recognition\", arXiv: 2603.02847." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Model Architecture\n", + "\n", + "SpeechNet is a 5-block CNN processing 14-channel EMG signals:\n", + "\n", + "| Block | Conv kernel | In→Out channels | Output shape |\n", + "|-------|------------|-----------------|-------------|\n", + "| 0 | (1, 4) | 1 → 8 | (8, 14, 87) after AvgPool(1,8) |\n", + "| 1 | (1, 16) | 8 → 16 | (16, 14, 22) after AvgPool(1,4) |\n", + "| 2 | (1, 8) | 16 → 16 | (16, 14, 5) after AvgPool(1,4) |\n", + "| 3 | (7, 1) | 16 → 32 | (32, 8, 5) after AvgPool(1,1) |\n", + "| 4 | (7, 1) | 32 → 32 | (32, 2, 5) after AvgPool(1,1) |\n", + "\n", + "Followed by GlobalAvgPool → Reshape → Linear(32, 9).\n", + "\n", + "Total: ~15K parameters, 9 output classes (8 speech commands + rest)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Defining a Deeploy-Friendly PyTorch Model\n", + "\n", + "When designing a model for Deeploy deployment, follow these rules:\n", + "\n", + "### Rule 1: No dynamic ONNX ops\n", + "Avoid `torch.flatten()`, `x.size()`, `x.shape[N]` in the forward pass. These generate dynamic `Shape`/`Gather`/`Flatten` ops in ONNX that Deeploy cannot handle.\n", + "\n", + "**Bad:**\n", + "```python\n", + "x = torch.flatten(x, 1) # generates Flatten + Shape in backward\n", + "```\n", + "\n", + "**Good:**\n", + "```python\n", + "x = x.reshape(1, self._fc_in) # static reshape, batch=1 for deployment\n", + "```\n", + "\n", + "### Rule 2: Use AvgPool instead of MaxPool\n", + "MaxPool gradient requires index storage. AvgPool gradient is a simple scatter-divide.\n", + "\n", + "### Rule 3: No Dropout\n", + "Dropout is a no-op at inference and unnecessary for on-device fine-tuning." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "from typing import Any, Dict, List, Optional\n", + "\n", + "\n", + "class SpeechNetDeploy(nn.Module):\n", + " \"\"\"Deployment-ready SpeechNet for Deeploy on PULP MCUs.\"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " num_channels: int = 14,\n", + " time_steps: int = 700,\n", + " num_classes: int = 9,\n", + " blocks_config: Optional[List[Dict[str, Any]]] = None,\n", + " ):\n", + " super().__init__()\n", + " if blocks_config is None:\n", + " blocks_config = [\n", + " dict(out_channels=8, kernel=(1, 4), pool=(1, 8)),\n", + " dict(out_channels=16, kernel=(1, 16), pool=(1, 4)),\n", + " dict(out_channels=16, kernel=(1, 8), pool=(1, 4)),\n", + " dict(out_channels=32, kernel=(7, 1), pool=(1, 1)),\n", + " dict(out_channels=32, kernel=(7, 1), pool=(1, 1)),\n", + " ]\n", + "\n", + " self.blocks = nn.ModuleList()\n", + " in_ch = 1\n", + " for cfg in blocks_config:\n", + " out_ch = cfg[\"out_channels\"]\n", + " k_c, k_t = cfg[\"kernel\"]\n", + " pool_c, pool_t = cfg.get(\"pool\", (1, 1))\n", + " layers = [\n", + " nn.Conv2d(in_ch, out_ch, kernel_size=(k_c, k_t),\n", + " padding=(0, k_t // 2), bias=True),\n", + " nn.BatchNorm2d(out_ch),\n", + " nn.ReLU(inplace=False), # inplace=False for clean ONNX\n", + " nn.AvgPool2d(kernel_size=(pool_c, pool_t),\n", + " stride=(pool_c, pool_t)),\n", + " ]\n", + " self.blocks.append(nn.Sequential(*layers))\n", + " in_ch = out_ch\n", + "\n", + " self.global_pool = nn.AdaptiveAvgPool2d((1, 1))\n", + " self._fc_in = in_ch # stored as Python int for static reshape\n", + " self.fc = nn.Linear(in_ch, num_classes)\n", + "\n", + " def forward(self, x: torch.Tensor) -> torch.Tensor:\n", + " for block in self.blocks:\n", + " x = block(x)\n", + " x = self.global_pool(x)\n", + " # Static reshape: avoids dynamic Shape/Flatten ops in ONNX\n", + " x = x.reshape(1, self._fc_in)\n", + " x 
= self.fc(x)\n", + " return x\n", + "\n", + "\n", + "model = SpeechNetDeploy()\n", + "x = torch.randn(1, 1, 14, 700)\n", + "y = model(x)\n", + "print(f\"Input: {x.shape} → Output: {y.shape}\")\n", + "print(f\"Parameters: {sum(p.numel() for p in model.parameters()):,}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Exporting ONNX with Onnx4Deeploy\n", + "\n", + "Onnx4Deeploy provides a unified CLI for exporting models to ONNX format compatible with Deeploy.\n", + "\n", + "### 3.1 Inference Export\n", + "\n", + "```bash\n", + "cd /path/to/Onnx4Deeploy\n", + "python Onnx4Deeploy.py -model SpeechNet -mode infer\n", + "```\n", + "\n", + "This produces:\n", + "- `onnx/model/speechnet_infer/network.onnx` — inference graph (BN folded into Conv)\n", + "- `onnx/model/speechnet_infer/inputs.npz` — test input\n", + "- `onnx/model/speechnet_infer/outputs.npz` — reference output\n", + "\n", + "### 3.2 Training Export\n", + "\n", + "```bash\n", + "python Onnx4Deeploy.py -model SpeechNet -mode train\n", + "```\n", + "\n", + "This produces:\n", + "- `onnx/model/speechnet_train/network.onnx` — training graph (forward + backward + gradient accumulation)\n", + "- `onnx/model/speechnet_train/inputs.npz` — multi-batch training data\n", + "- `onnx/model/speechnet_train/outputs.npz` — reference updated weights + losses\n", + "- `onnx/model/speechnet_optimizer/network.onnx` — SGD optimizer graph" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Verify the training ONNX graph structure\n", + "import onnx\n", + "from collections import Counter\n", + "\n", + "m = onnx.load(\"onnx/model/speechnet_train/network.onnx\")\n", + "c = Counter(n.op_type for n in m.graph.node)\n", + "print(f\"Total nodes: {len(m.graph.node)}\")\n", + "print(f\"Forward ops: Conv={c['Conv']}, BN={c['BatchNormInternal']}, Relu={c['Relu']}, AvgPool={c['AveragePool']}\")\n", + "print(f\"Backward ops: 
ConvGrad={c['ConvGrad']}, BNGrad={c['BatchNormalizationGrad']}, ReluGrad={c['ReluGrad']}\")\n", + "print(f\"Training ops: InPlaceAccumulatorV2={c['InPlaceAccumulatorV2']}, SoftmaxCELoss={c['SoftmaxCrossEntropyLoss']}\")\n", + "\n", + "# Check for dynamic ops (should be 0)\n", + "dynamic_ops = ['Shape', 'Flatten', 'Expand', 'Gather']\n", + "bad = {op: c[op] for op in dynamic_ops if c.get(op, 0) > 0}\n", + "assert not bad, f\"Dynamic ops found: {bad}\"\n", + "print(\"\\n✅ Clean graph — no dynamic ops\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.3 Training Strategies\n", + "\n", + "You can control which layers are trainable:\n", + "\n", + "```bash\n", + "# Full training (all layers)\n", + "python Onnx4Deeploy.py -model SpeechNet -mode train\n", + "\n", + "# Last-layer only (transfer learning)\n", + "python Onnx4Deeploy.py -model SpeechNet -mode train --training-strategy last_layer\n", + "```\n", + "\n", + "The training strategy controls the backward graph size:\n", + "\n", + "| Strategy | Trainable params | Backward ops | Use case |\n", + "|----------|-----------------|-------------|----------|\n", + "| `full` | 22 | ConvGrad×5, BNGrad×5, ReluGrad×5, AvgPoolGrad×5 | Full fine-tuning |\n", + "| `last_layer` | 2 (fc only) | Gemm backward only | Quick adaptation |\n", + "| `custom` | User-defined | Depends on selection | Selective fine-tuning |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Deploying with Deeploy on Siracusa\n", + "\n", + "### 4.1 Environment Setup\n", + "\n", + "```bash\n", + "# Activate the TrainDeeploy environment\n", + "source /path/to/TrainDeeploy/activate_traindeeploy.sh\n", + "cd TrainDeeploy/DeeployTest\n", + "```\n", + "\n", + "### 4.2 Untiled Deployment (Smoke Test)\n", + "\n", + "Run the untiled version first to verify numerical correctness:\n", + "\n", + "```bash\n", + "python deeployTrainingRunner_siracusa.py \\\n", + " -t /path/to/Onnx4Deeploy/onnx/model/speechnet_train\n", + "```\n", + "\n", + "Expected output:\n", + "```\n", + "=== Siracusa Training Harness (Phase 2 — with OptimizerNetwork) ===\n", + "N_TRAIN_STEPS=4 N_ACCUM_STEPS=1 DATA_INPUTS=2\n", + "Initializing TrainingNetwork...\n", + "Initializing OptimizerNetwork...\n", + "Starting training (4 optimizer steps x 1 accum steps)...\n", + "update 1/4 accum 1/1 (mini-batch 0)\n", + "...\n", + "[loss 0] computed=2.267950 ref=2.267950 diff=0.000000 TOL=0.001000\n", + "[loss 1] computed=2.498553 ref=2.498553 diff=0.000000 TOL=0.001000\n", + "[loss 2] computed=2.083153 ref=2.083153 diff=0.000000 TOL=0.001000\n", + "[loss 3] computed=1.905963 ref=1.905963 diff=0.000000 TOL=0.001000\n", + "Errors: 0 out of 4\n", + "BENCH train_cycles=285250543 opt_cycles=429083 weight_sram=61956\n", + "\n", + "✓ Test speechnet_train PASSED - No errors found\n", + "```\n", + "\n", + "### 4.3 Tiled Deployment\n", + "\n", + "For real MCU deployment, use tiling to fit within L1 memory:\n", + "\n", + "```bash\n", + "python deeployTrainingRunner_tiled_siracusa.py \\\n", + " -t /path/to/Onnx4Deeploy/onnx/model/speechnet_train \\\n", + " --l1 128000 --l2 2000000\n", + "```\n", + "\n", + "The tiler automatically splits large activations into tiles that fit in L1 (128 KB)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. 
Understanding the Tiling Pipeline\n", + "\n", + "Deeploy's tiling pipeline works as follows:\n", + "\n", + "```\n", + "ONNX graph\n", + " ↓\n", + "FrontEnd: graph lowering, node renaming, constant folding\n", + " ↓ \n", + "Parse: match each node to a NodeMapper (Parser + Bindings)\n", + " ↓\n", + "Broadcast: compute/update tensor shapes\n", + " ↓\n", + "TypeCheck: select the best NodeBinding (Template + TypeChecker)\n", + " ↓\n", + "Bind: hoist transient buffers (e.g., im2col), set up execution blocks\n", + " ↓\n", + "Tile: OR-Tools solver finds tile dimensions under L1/L2 constraints\n", + " ↓\n", + "CodeGen: render C code with per-tile DMA + kernel calls\n", + " ↓\n", + "Build: compile with LLVM for RISC-V\n", + " ↓\n", + "Simulate: run on GVSoC cycle-accurate simulator\n", + "```\n", + "\n", + "### Key concepts:\n", + "\n", + "- **TileConstraint**: Defines how each op can be tiled (which dims are free, which are pinned)\n", + "- **Transient buffers**: Scratch memory needed by kernels (e.g., im2col buffer for Conv)\n", + "- **Memory hierarchy**: L1 (128 KB SRAM, fast) → L2 (2 MB SRAM) → L3 (off-chip HyperRAM, slow)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Common Pitfalls and Solutions\n", + "\n", + "### Pitfall 1: `torch.flatten` generates dynamic Shape ops\n", + "**Symptom**: Training graph has `Shape` + `Reshape` nodes from Flatten backward.\n", + "**Fix**: Use `x.reshape(1, C)` with static dimensions.\n", + "\n", + "### Pitfall 2: ConvGradX Im2Col buffer exceeds L1\n", + "**Symptom**: Tiled training hangs — GVSoC runs but no output.\n", + "**Cause**: The Im2Col ConvGradX kernel gets `ctxtBufferSize` from full-op dimensions (e.g., 1.2 MB) but the actual L1 allocation is only ~120 KB. The kernel's `co_block` auto-tuning overestimates → L1 overflow.\n", + "**Fix**: Use the naive ConvGradX kernel (`referenceConvGradX2DTemplate`) which doesn't require im2col. 
Change in `Bindings.py`.\n", + "\n", + "### Pitfall 3: ConvLayer.computeShapes corrupts bias shape\n", + "**Symptom**: `TypeError: 'int' object is not iterable` during graph export.\n", + "**Cause**: `inputShapes[2] = inputShapes[1][0]` sets bias shape to a scalar int instead of tuple.\n", + "**Fix**: `inputShapes[2] = (inputShapes[1][0],)` in `Layers.py`.\n", + "\n", + "### Pitfall 4: Multiple GVSoC simulations sharing workdir\n", + "**Symptom**: `exitcode: -9` (SIGKILL) — simulations kill each other.\n", + "**Fix**: Use `PYTEST_XDIST_WORKER=<worker-id>` to isolate build directories.\n", + "\n", + "### Pitfall 5: GVSoC stdout is fully buffered\n", + "**Symptom**: Simulation runs but no printf output visible.\n", + "**Fix**: Use `--trace=cluster/pe0/insn` to force output, or use `ring_tee.py` for bounded trace capture with heartbeat monitoring." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## 7. Debugging with GVSoC Traces\n\nWhen a simulation hangs or produces wrong results, use GVSoC's built-in tracing:\n\n### Trace FC (fabric controller) instructions\n```bash\ngvsoc --target=siracusa --binary=<path/to/binary> --work-dir=<path/to/workdir> \\\n --trace=fc/insn image flash run 2>trace_fc.txt\n```\nShows every instruction the FC executes. Useful for finding where FC is stuck (e.g., `pi_task_wait_on` = waiting for cluster, `memcpy` = initializing data).\n\n### Trace cluster PE instructions\n```bash\ngvsoc --target=siracusa --binary=<path/to/binary> --work-dir=<path/to/workdir> \\\n --trace=cluster/pe0/insn image flash run 2>trace_pe0.txt\n```\nShows PE0's instructions. 
Look for the function name in the trace to identify which kernel is running:\n```\n125461135406: 9037685: [/chip/cluster/pe0/insn] PULP_Conv2d_Im2Col_fp32_fp32_f:0 M 1c031d58 flw ...\n```\n\n### Trace memory accesses (LSU)\n```bash\n--trace=cluster/pe0/lsu\n```\nCatches invalid memory accesses:\n```\nInvalid access (pc: 0x1c01c94c, offset: 0x3c9cf7a9, size: 0x3, is_write: 0)\n```\nThis means a kernel tried to read address `0x3c9cf7a9` which is outside L1/L2 — indicates a buffer overflow or wrong DMA offset.\n\n### Useful trace targets\n\n| Trace flag | What it shows |\n|-----------|--------------|\n| `fc/insn` | FC instruction stream |\n| `cluster/pe0/insn` | Cluster PE0 instructions |\n| `cluster/pe0/lsu` | PE0 memory load/store events |\n| `cluster/dma` | DMA transfer events |\n\n### Tips\n- Redirect trace to a file (`2>trace.txt`) — trace output goes to stderr\n- Use `timeout 30 gvsoc ...` to limit trace duration\n- Look at the **last few lines** of the trace to find where it's stuck\n- Use `llvm-objdump -d <path/to/binary>` to map PC addresses to function names" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": "## 8. Exercises\n\n1. **Export and deploy SpeechNet inference** on Siracusa. Compare the ONNX node count with the training graph.\n\n2. **Try `last_layer` training strategy** — only fine-tune the FC layer. Compare cycle count with full training.\n\n3. **Increase training steps** — export with `--n-batches 16` (or `--n-steps 8 --n-accum 2`). Run on GVSoC and observe how loss evolves over more steps. Does it converge?\n\n4. **Debug a hang**: Intentionally use `torch.flatten(x, 1)` in the model, export training ONNX, and observe what extra ops appear. Then fix it." + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 9. 
Reference\n", + "\n", + "- [SilentWear paper](https://arxiv.org/abs/2603.02847)\n", + "- [Onnx4Deeploy repo](https://github.com/runwangdl/Onnx4Deeploy) — PR #2: SpeechNet exporter\n", + "- [TrainDeeploy repo](https://github.com/runwangdl/TrainDeeploy) — PR #31: SpeechNet training test\n", + "- [Deeploy TileConstraint docs](../AI_AGENT/Deeploy_Basics/Deeploy_TileConstraint.md)\n", + "- [Deeploy Kernel docs](../AI_AGENT/Deeploy_Basics/Deeploy_Kernel.md)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file