diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py
index 9ebad6c4..d0ef93ba 100644
--- a/Deeploy/Targets/Generic/Layers.py
+++ b/Deeploy/Targets/Generic/Layers.py
@@ -358,7 +358,7 @@ def __init__(self, maps: List[NodeMapper]):
     def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
                       channels_first) -> Tuple[Shape, Shape]:
         if len(inputShapes) == 3:
-            inputShapes[2] = inputShapes[1][0]
+            inputShapes[2] = (inputShapes[1][0],)
         return (inputShapes, outputShapes)
 
     def computeOps(self):
diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
index bd0e2976..f73be997 100644
--- a/Deeploy/Targets/PULPOpen/Bindings.py
+++ b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -252,7 +252,7 @@
 
 PULPFloatConvGradX2DBindings = [
     NodeBinding(ConvChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
-                FloatConvGradTemplate.referenceConvGradX2DIm2ColTiledTemplate, ForkTransformer)
+                FloatConvGradTemplate.referenceConvGradX2DTemplate, ForkTransformer)
 ]
 
 PULPFloatDWConv2DBindings = [
diff --git a/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_optimizer/network.onnx b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_optimizer/network.onnx
new file mode 100644
index 00000000..a3ad349c
Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_optimizer/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/inputs.npz b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/inputs.npz
new file mode 100644
index 00000000..29393bb0
Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/network.onnx b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/network.onnx
new file mode 100644
index 00000000..5b4150e1
Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/outputs.npz b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/outputs.npz
new file mode 100644
index 00000000..c6faf181
Binary files /dev/null and b/DeeployTest/Tests/Models/Training/SpeechNet/speechnet_train/outputs.npz differ
diff --git a/DeeployTest/testUtils/codeGenerateTraining.py b/DeeployTest/testUtils/codeGenerateTraining.py
index 25eb23b0..418fa41a 100644
--- a/DeeployTest/testUtils/codeGenerateTraining.py
+++ b/DeeployTest/testUtils/codeGenerateTraining.py
@@ -127,7 +127,7 @@ def generateTrainingTestInputsHeader(deployer: NetworkDeployer,
                 paddingElements = (pad_bytes * 8 + typeWidth - 1) // typeWidth
                 list_str += ", " + ", ".join("0" for _ in range(paddingElements))
 
-            retStr += f"{typeName} {buf_name}[] = {{{list_str}}};\n"
+            retStr += f'__attribute__((section(".weightmem_sram"))) {typeName} {buf_name}[] = {{{list_str}}};\n'
 
         # Emit the row pointer array for this mini-batch
         row_name = f"testDataRow{mb}"
@@ -503,7 +503,10 @@ def build_shared_buffer_maps(train_onnx_path: str, opt_onnx_model) -> Tuple[Dict
     return shared_input_map, shared_output_map
 
 
-def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_output_map: Dict[int, int]) -> str:
+def _patch_shared_buffers(retStr: str,
+                          shared_input_map: Dict[int, int],
+                          shared_output_map: Dict[int, int],
+                          train_c_source: str = "") -> str:
     """Redirect optimizer I/O buffers to Training's already-allocated buffers.
 
     Must be called AFTER the _TRAIN_PREFIX → _OPT_PREFIX substitution so that
@@ -558,12 +561,25 @@ def _patch_shared_buffers(retStr: str, shared_input_map: Dict[int, int], shared_
     _arena_pat = re.compile(r'(DeeployOptNetwork_(input|output)_(\d+))\s*=\s*\([^)]+\s*\*\s*\)'
                             r'\s*\(\s*\(char\s*\*\)\s*DeeployOptNetwork_MEMORYARENA_L\w+\s*\+\s*\d+\s*\)\s*;')
 
+    def _is_train_l2(train_idx: int) -> bool:
+        """Check if training input_N was allocated with pi_l2_malloc (promoted).
+        If so, sharing the pointer would send an L2 address to the optimizer's
+        closure_L3 pi_cl_ram_copy_2d → HyperRAM OOB."""
+        if not train_c_source:
+            return False
+        pat = rf'{_TRAIN_PREFIX}input_{train_idx}\s*=\s*\([^)]+\)\s*pi_l2_malloc\b'
+        return bool(re.search(pat, train_c_source))
+
     def _make_replacement(symbol: str, kind: str, idx: int) -> Optional[str]:
         if kind == "input" and idx in shared_input_map:
             train_idx = shared_input_map[idx]
+            if _is_train_l2(train_idx):
+                return None  # Don't share: training buffer at L2, optimizer expects L3
             return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx};  /* shared with TrainingNetwork */'
         if kind == "output" and idx in shared_output_map:
             train_idx = shared_output_map[idx]
+            if _is_train_l2(train_idx):
+                return None
             return f'{symbol} = (float32_t *){_TRAIN_PREFIX}input_{train_idx};  /* in-place, shared with TrainingNetwork */'
         return None
 
@@ -574,6 +590,41 @@ def _replace(m: re.Match) -> str:
     retStr = _malloc_pat.sub(_replace, retStr)
     retStr = _arena_pat.sub(_replace, retStr)
 
+    # ------------------------------------------------------------------
+    # Drop load_file_to_ram() for shared I/O buffers.
+    #
+    # InitOptimizerNetwork() emits one line per input:
+    #
+    #     load_file_to_ram(DeeployOptNetwork_input_N, "N.hex");
+    #
+    # which expands to cl_ram_write(addr, ...). cl_ram_write expects
+    # `addr` to be a hyperram (L3) offset; the underlying DMA engine
+    # masks it to the hyperram address range. For a shared input that
+    # has been redirected (above) to a TrainingNetwork buffer, the
+    # destination address is whatever level that buffer lives in -- and
+    # once PromoteTensorsToL2 starts hoisting training inputs to L2,
+    # that pointer is an L2 address. Stripping it to a hyperram offset
+    # yields nonsense (e.g. 0x10800000 -> 0x800000) which GVSoC reports
+    # as `/ram out-of-bound request (addr 0x800000, ram_size 0x800000)`
+    # and the simulation aborts.
+    #
+    # These loads are also dead code: the test harness re-initialises
+    # every shared input via l3_aware_copy(testInitWeights[]) after both
+    # InitTrainingNetwork() and InitOptimizerNetwork() return, and that
+    # helper picks the right L2/L3 writer per buffer.
+    _load_pat = re.compile(r'[^\n]*load_file_to_ram\s*\(\s*DeeployOptNetwork_(input|output)_(\d+)\s*,[^;]+\);\s*\n')
+
+    def _maybe_drop_load(m: re.Match) -> str:
+        kind = m.group(1)
+        idx = int(m.group(2))
+        if kind == "input" and idx in shared_input_map:
+            return ''
+        if kind == "output" and idx in shared_output_map:
+            return ''
+        return m.group(0)
+
+    retStr = _load_pat.sub(_maybe_drop_load, retStr)
+
     # ------------------------------------------------------------------
     # Arena elimination: if a MEMORYARENA_Lx is no longer used for any
     # pointer arithmetic after the redirects, its malloc is dead and can
@@ -852,7 +903,10 @@ def generateOptimizerNetworkImplementation(deployer: NetworkDeployer,
     # Prefix substitution
     retStr = retStr.replace(_TRAIN_PREFIX, _OPT_PREFIX)
     # Replace malloc calls for shared weight/grad buffers with Training pointers
-    retStr = _patch_shared_buffers(retStr, shared_input_map or {}, shared_output_map or {})
+    retStr = _patch_shared_buffers(retStr,
+                                   shared_input_map or {},
+                                   shared_output_map or {},
+                                   train_c_source = train_c_source or "")
     # Redirect optimizer L1/L2 arena mallocs to reuse training arenas
     if train_c_source:
         retStr = _patch_shared_arenas(retStr, train_c_source)
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index bafa6635..90eaf13c 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -161,6 +161,7 @@
     "Models/Training/SimpleMLP/simplemlp_train": [64000],
     "Models/Training/Autoencoder/autoencoder_train": [128000],
     "Models/Training/DSCNN/dscnn_train": [128000, 64000],
+    "Models/Training/SpeechNet/speechnet_train": [128000],
 }
 
 # Training-enabled tiled models that need L3 spill (weights/activations don't
diff --git a/docs/tutorial_speechnet_training.ipynb b/docs/tutorial_speechnet_training.ipynb
new file mode 100644
index 00000000..266010b0
--- /dev/null
+++ b/docs/tutorial_speechnet_training.ipynb
@@ -0,0 +1,380 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SpeechNet On-Device Training Tutorial\n",
+    "\n",
+    "This tutorial walks through the complete pipeline for deploying **SpeechNet** (a lightweight CNN for EMG-based silent speech recognition) on the **Siracusa RISC-V MCU** using **Deeploy**.\n",
+    "\n",
+    "You will learn:\n",
+    "1. How to define a Deeploy-friendly PyTorch model\n",
+    "2. How to export inference and training ONNX graphs using Onnx4Deeploy\n",
+    "3. How to run untiled and tiled Deeploy deployment on Siracusa (GVSoC)\n",
+    "4. Key design decisions and pitfalls\n",
+    "\n",
+    "**Prerequisites**: Familiarity with PyTorch, ONNX, and basic knowledge of RISC-V MCU architectures.\n",
+    "\n",
+    "**Reference**: Spacone et al., \"SilentWear: an Ultra-Low Power Wearable System for EMG-based Silent Speech Recognition\", arXiv: 2603.02847."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Model Architecture\n",
+    "\n",
+    "SpeechNet is a 5-block CNN processing 14-channel EMG signals:\n",
+    "\n",
+    "| Block | Conv kernel | In→Out channels | Output shape |\n",
+    "|-------|------------|-----------------|-------------|\n",
+    "| 0 | (1, 4) | 1 → 8 | (8, 14, 87) after AvgPool(1,8) |\n",
+    "| 1 | (1, 16) | 8 → 16 | (16, 14, 22) after AvgPool(1,4) |\n",
+    "| 2 | (1, 8) | 16 → 16 | (16, 14, 5) after AvgPool(1,4) |\n",
+    "| 3 | (7, 1) | 16 → 32 | (32, 8, 5) after AvgPool(1,1) |\n",
+    "| 4 | (7, 1) | 32 → 32 | (32, 2, 5) after AvgPool(1,1) |\n",
+    "\n",
+    "Followed by GlobalAvgPool → Reshape → Linear(32, 9).\n",
+    "\n",
+    "Total: ~15K parameters, 9 output classes (8 speech commands + rest)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Defining a Deeploy-Friendly PyTorch Model\n",
+    "\n",
+    "When designing a model for Deeploy deployment, follow these rules:\n",
+    "\n",
+    "### Rule 1: No dynamic ONNX ops\n",
+    "Avoid `torch.flatten()`, `x.size()`, `x.shape[N]` in the forward pass. These generate dynamic `Shape`/`Gather`/`Flatten` ops in ONNX that Deeploy cannot handle.\n",
+    "\n",
+    "**Bad:**\n",
+    "```python\n",
+    "x = torch.flatten(x, 1)  # generates Flatten + Shape in backward\n",
+    "```\n",
+    "\n",
+    "**Good:**\n",
+    "```python\n",
+    "x = x.reshape(1, self._fc_in)  # static reshape, batch=1 for deployment\n",
+    "```\n",
+    "\n",
+    "### Rule 2: Use AvgPool instead of MaxPool\n",
+    "MaxPool gradient requires index storage. AvgPool gradient is a simple scatter-divide.\n",
+    "\n",
+    "### Rule 3: No Dropout\n",
+    "Dropout is a no-op at inference and unnecessary for on-device fine-tuning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torch.nn as nn\n",
+    "from typing import Any, Dict, List, Optional\n",
+    "\n",
+    "\n",
+    "class SpeechNetDeploy(nn.Module):\n",
+    "    \"\"\"Deployment-ready SpeechNet for Deeploy on PULP MCUs.\"\"\"\n",
+    "\n",
+    "    def __init__(\n",
+    "        self,\n",
+    "        num_channels: int = 14,\n",
+    "        time_steps: int = 700,\n",
+    "        num_classes: int = 9,\n",
+    "        blocks_config: Optional[List[Dict[str, Any]]] = None,\n",
+    "    ):\n",
+    "        super().__init__()\n",
+    "        if blocks_config is None:\n",
+    "            blocks_config = [\n",
+    "                dict(out_channels=8, kernel=(1, 4), pool=(1, 8)),\n",
+    "                dict(out_channels=16, kernel=(1, 16), pool=(1, 4)),\n",
+    "                dict(out_channels=16, kernel=(1, 8), pool=(1, 4)),\n",
+    "                dict(out_channels=32, kernel=(7, 1), pool=(1, 1)),\n",
+    "                dict(out_channels=32, kernel=(7, 1), pool=(1, 1)),\n",
+    "            ]\n",
+    "\n",
+    "        self.blocks = nn.ModuleList()\n",
+    "        in_ch = 1\n",
+    "        for cfg in blocks_config:\n",
+    "            out_ch = cfg[\"out_channels\"]\n",
+    "            k_c, k_t = cfg[\"kernel\"]\n",
+    "            pool_c, pool_t = cfg.get(\"pool\", (1, 1))\n",
+    "            layers = [\n",
+    "                nn.Conv2d(in_ch, out_ch, kernel_size=(k_c, k_t),\n",
+    "                          padding=(0, k_t // 2), bias=True),\n",
+    "                nn.BatchNorm2d(out_ch),\n",
+    "                nn.ReLU(inplace=False),  # inplace=False for clean ONNX\n",
+    "                nn.AvgPool2d(kernel_size=(pool_c, pool_t),\n",
+    "                             stride=(pool_c, pool_t)),\n",
+    "            ]\n",
+    "            self.blocks.append(nn.Sequential(*layers))\n",
+    "            in_ch = out_ch\n",
+    "\n",
+    "        self.global_pool = nn.AdaptiveAvgPool2d((1, 1))\n",
+    "        self._fc_in = in_ch  # stored as Python int for static reshape\n",
+    "        self.fc = nn.Linear(in_ch, num_classes)\n",
+    "\n",
+    "    def forward(self, x: torch.Tensor) -> torch.Tensor:\n",
+    "        for block in self.blocks:\n",
+    "            x = block(x)\n",
+    "        x = self.global_pool(x)\n",
+    "        # Static reshape: avoids dynamic Shape/Flatten ops in ONNX\n",
+    "        x = x.reshape(1, self._fc_in)\n",
+    "        x = self.fc(x)\n",
+    "        return x\n",
+    "\n",
+    "\n",
+    "model = SpeechNetDeploy()\n",
+    "x = torch.randn(1, 1, 14, 700)\n",
+    "y = model(x)\n",
+    "print(f\"Input: {x.shape} → Output: {y.shape}\")\n",
+    "print(f\"Parameters: {sum(p.numel() for p in model.parameters()):,}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Exporting ONNX with Onnx4Deeploy\n",
+    "\n",
+    "Onnx4Deeploy provides a unified CLI for exporting models to ONNX format compatible with Deeploy.\n",
+    "\n",
+    "### 3.1 Inference Export\n",
+    "\n",
+    "```bash\n",
+    "cd /path/to/Onnx4Deeploy\n",
+    "python Onnx4Deeploy.py -model SpeechNet -mode infer\n",
+    "```\n",
+    "\n",
+    "This produces:\n",
+    "- `onnx/model/speechnet_infer/network.onnx` — inference graph (BN folded into Conv)\n",
+    "- `onnx/model/speechnet_infer/inputs.npz` — test input\n",
+    "- `onnx/model/speechnet_infer/outputs.npz` — reference output\n",
+    "\n",
+    "### 3.2 Training Export\n",
+    "\n",
+    "```bash\n",
+    "python Onnx4Deeploy.py -model SpeechNet -mode train\n",
+    "```\n",
+    "\n",
+    "This produces:\n",
+    "- `onnx/model/speechnet_train/network.onnx` — training graph (forward + backward + gradient accumulation)\n",
+    "- `onnx/model/speechnet_train/inputs.npz` — multi-batch training data\n",
+    "- `onnx/model/speechnet_train/outputs.npz` — reference updated weights + losses\n",
+    "- `onnx/model/speechnet_optimizer/network.onnx` — SGD optimizer graph"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Verify the training ONNX graph structure\n",
+    "import onnx\n",
+    "from collections import Counter\n",
+    "\n",
+    "m = onnx.load(\"onnx/model/speechnet_train/network.onnx\")\n",
+    "c = Counter(n.op_type for n in m.graph.node)\n",
+    "print(f\"Total nodes: {len(m.graph.node)}\")\n",
+    "print(f\"Forward ops: Conv={c['Conv']}, BN={c['BatchNormInternal']}, Relu={c['Relu']}, AvgPool={c['AveragePool']}\")\n",
+    "print(f\"Backward ops: ConvGrad={c['ConvGrad']}, BNGrad={c['BatchNormalizationGrad']}, ReluGrad={c['ReluGrad']}\")\n",
+    "print(f\"Training ops: InPlaceAccumulatorV2={c['InPlaceAccumulatorV2']}, SoftmaxCELoss={c['SoftmaxCrossEntropyLoss']}\")\n",
+    "\n",
+    "# Check for dynamic ops (should be 0)\n",
+    "dynamic_ops = ['Shape', 'Flatten', 'Expand', 'Gather']\n",
+    "bad = {op: c[op] for op in dynamic_ops if c.get(op, 0) > 0}\n",
+    "assert not bad, f\"Dynamic ops found: {bad}\"\n",
+    "print(\"\\n✅ Clean graph — no dynamic ops\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3.3 Training Strategies\n",
+    "\n",
+    "You can control which layers are trainable:\n",
+    "\n",
+    "```bash\n",
+    "# Full training (all layers)\n",
+    "python Onnx4Deeploy.py -model SpeechNet -mode train\n",
+    "\n",
+    "# Last-layer only (transfer learning)\n",
+    "python Onnx4Deeploy.py -model SpeechNet -mode train --training-strategy last_layer\n",
+    "```\n",
+    "\n",
+    "The training strategy controls the backward graph size:\n",
+    "\n",
+    "| Strategy | Trainable parameter tensors | Backward ops | Use case |\n",
+    "|----------|-----------------|-------------|----------|\n",
+    "| `full` | 22 | ConvGrad×5, BNGrad×5, ReluGrad×5, AvgPoolGrad×5 | Full fine-tuning |\n",
+    "| `last_layer` | 2 (fc only) | Gemm backward only | Quick adaptation |\n",
+    "| `custom` | User-defined | Depends on selection | Selective fine-tuning |"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Deploying with Deeploy on Siracusa\n",
+    "\n",
+    "### 4.1 Environment Setup\n",
+    "\n",
+    "```bash\n",
+    "# Activate the TrainDeeploy environment\n",
+    "source /path/to/TrainDeeploy/activate_traindeeploy.sh\n",
+    "cd TrainDeeploy/DeeployTest\n",
+    "```\n",
+    "\n",
+    "### 4.2 Untiled Deployment (Smoke Test)\n",
+    "\n",
+    "Run the untiled version first to verify numerical correctness:\n",
+    "\n",
+    "```bash\n",
+    "python deeployTrainingRunner_siracusa.py \\\n",
+    "    -t /path/to/Onnx4Deeploy/onnx/model/speechnet_train\n",
+    "```\n",
+    "\n",
+    "Expected output:\n",
+    "```\n",
+    "=== Siracusa Training Harness (Phase 2 — with OptimizerNetwork) ===\n",
+    "N_TRAIN_STEPS=4  N_ACCUM_STEPS=1  DATA_INPUTS=2\n",
+    "Initializing TrainingNetwork...\n",
+    "Initializing OptimizerNetwork...\n",
+    "Starting training (4 optimizer steps x 1 accum steps)...\n",
+    "update 1/4  accum 1/1  (mini-batch 0)\n",
+    "...\n",
+    "[loss 0] computed=2.267950  ref=2.267950  diff=0.000000  TOL=0.001000\n",
+    "[loss 1] computed=2.498553  ref=2.498553  diff=0.000000  TOL=0.001000\n",
+    "[loss 2] computed=2.083153  ref=2.083153  diff=0.000000  TOL=0.001000\n",
+    "[loss 3] computed=1.905963  ref=1.905963  diff=0.000000  TOL=0.001000\n",
+    "Errors: 0 out of 4\n",
+    "BENCH train_cycles=285250543 opt_cycles=429083 weight_sram=61956\n",
+    "\n",
+    "✓ Test speechnet_train PASSED - No errors found\n",
+    "```\n",
+    "\n",
+    "### 4.3 Tiled Deployment\n",
+    "\n",
+    "For real MCU deployment, use tiling to fit within L1 memory:\n",
+    "\n",
+    "```bash\n",
+    "python deeployTrainingRunner_tiled_siracusa.py \\\n",
+    "    -t /path/to/Onnx4Deeploy/onnx/model/speechnet_train \\\n",
+    "    --l1 128000 --l2 2000000\n",
+    "```\n",
+    "\n",
+    "The tiler automatically splits large activations into tiles that fit in L1 (128 KB)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Understanding the Tiling Pipeline\n",
+    "\n",
+    "Deeploy's tiling pipeline works as follows:\n",
+    "\n",
+    "```\n",
+    "ONNX graph\n",
+    "    ↓\n",
+    "FrontEnd: graph lowering, node renaming, constant folding\n",
+    "    ↓  \n",
+    "Parse: match each node to a NodeMapper (Parser + Bindings)\n",
+    "    ↓\n",
+    "Broadcast: compute/update tensor shapes\n",
+    "    ↓\n",
+    "TypeCheck: select the best NodeBinding (Template + TypeChecker)\n",
+    "    ↓\n",
+    "Bind: hoist transient buffers (e.g., im2col), set up execution blocks\n",
+    "    ↓\n",
+    "Tile: OR-Tools solver finds tile dimensions under L1/L2 constraints\n",
+    "    ↓\n",
+    "CodeGen: render C code with per-tile DMA + kernel calls\n",
+    "    ↓\n",
+    "Build: compile with LLVM for RISC-V\n",
+    "    ↓\n",
+    "Simulate: run on GVSoC cycle-accurate simulator\n",
+    "```\n",
+    "\n",
+    "### Key concepts:\n",
+    "\n",
+    "- **TileConstraint**: Defines how each op can be tiled (which dims are free, which are pinned)\n",
+    "- **Transient buffers**: Scratch memory needed by kernels (e.g., im2col buffer for Conv)\n",
+    "- **Memory hierarchy**: L1 (128 KB SRAM, fast) → L2 (2 MB SRAM) → L3 (HyperRAM, slow)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Common Pitfalls and Solutions\n",
+    "\n",
+    "### Pitfall 1: `torch.flatten` generates dynamic Shape ops\n",
+    "- **Symptom**: Training graph has `Shape` + `Reshape` nodes from Flatten backward.\n",
+    "- **Fix**: Use `x.reshape(1, C)` with static dimensions.\n",
+    "\n",
+    "### Pitfall 2: ConvGradX Im2Col buffer exceeds L1\n",
+    "- **Symptom**: Tiled training hangs — GVSoC runs but no output.\n",
+    "- **Cause**: The Im2Col ConvGradX kernel gets `ctxtBufferSize` from full-op dimensions (e.g., 1.2 MB) but the actual L1 allocation is only ~120 KB. The kernel's `co_block` auto-tuning then overestimates the available buffer and overflows L1.\n",
+    "- **Fix**: Use the naive ConvGradX kernel (`referenceConvGradX2DTemplate`) which doesn't require im2col. Change in `Bindings.py`.\n",
+    "\n",
+    "### Pitfall 3: ConvLayer.computeShapes corrupts bias shape\n",
+    "- **Symptom**: `TypeError: 'int' object is not iterable` during graph export.\n",
+    "- **Cause**: `inputShapes[2] = inputShapes[1][0]` sets bias shape to a scalar int instead of tuple.\n",
+    "- **Fix**: `inputShapes[2] = (inputShapes[1][0],)` in `Layers.py`.\n",
+    "\n",
+    "### Pitfall 4: Multiple GVSoC simulations sharing workdir\n",
+    "- **Symptom**: `exitcode: -9` (SIGKILL) — simulations kill each other.\n",
+    "- **Fix**: Use `PYTEST_XDIST_WORKER=<unique_id>` to isolate build directories.\n",
+    "\n",
+    "### Pitfall 5: GVSoC stdout is fully buffered\n",
+    "- **Symptom**: Simulation runs but no printf output visible.\n",
+    "- **Fix**: Use `--trace=cluster/pe0/insn` to force output, or use `ring_tee.py` for bounded trace capture with heartbeat monitoring."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "## 7. Debugging with GVSoC Traces\n\nWhen a simulation hangs or produces wrong results, use GVSoC's built-in tracing:\n\n### Trace FC (fabric controller) instructions\n```bash\ngvsoc --target=siracusa --binary=<bin> --work-dir=<dir> \\\n    --trace=fc/insn image flash run 2>trace_fc.txt\n```\nShows every instruction the FC executes. Useful for finding where FC is stuck (e.g., `pi_task_wait_on` = waiting for cluster, `memcpy` = initializing data).\n\n### Trace cluster PE instructions\n```bash\ngvsoc --target=siracusa --binary=<bin> --work-dir=<dir> \\\n    --trace=cluster/pe0/insn image flash run 2>trace_pe0.txt\n```\nShows PE0's instructions. Look for the function name in the trace to identify which kernel is running:\n```\n125461135406: 9037685: [/chip/cluster/pe0/insn] PULP_Conv2d_Im2Col_fp32_fp32_f:0 M 1c031d58 flw ...\n```\n\n### Trace memory accesses (LSU)\n```bash\n--trace=cluster/pe0/lsu\n```\nCatches invalid memory accesses:\n```\nInvalid access (pc: 0x1c01c94c, offset: 0x3c9cf7a9, size: 0x3, is_write: 0)\n```\nThis means a kernel tried to read address `0x3c9cf7a9` which is outside L1/L2 — indicates a buffer overflow or wrong DMA offset.\n\n### Useful trace targets\n\n| Trace flag | What it shows |\n|-----------|--------------|\n| `fc/insn` | FC instruction stream |\n| `cluster/pe0/insn` | Cluster PE0 instructions |\n| `cluster/pe0/lsu` | PE0 memory load/store events |\n| `cluster/dma` | DMA transfer events |\n\n### Tips\n- Redirect trace to a file (`2>trace.txt`) — trace output goes to stderr\n- Use `timeout 30 gvsoc ...` to limit trace duration\n- Look at the **last few lines** of the trace to find where it's stuck\n- Use `llvm-objdump -d <binary>` to map PC addresses to function names"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": "## 8. Exercises\n\n1. **Export and deploy SpeechNet inference** on Siracusa. Compare the ONNX node count with the training graph.\n\n2. **Try `last_layer` training strategy** — only fine-tune the FC layer. Compare cycle count with full training.\n\n3. **Increase training steps** — export with `--n-batches 16` (or `--n-steps 8 --n-accum 2`). Run on GVSoC and observe how loss evolves over more steps. Does it converge?\n\n4. **Debug a dynamic-op failure**: Intentionally use `torch.flatten(x, 1)` in the model, export training ONNX, and observe what extra ops appear (see Pitfall 1). Then fix it."
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 9. Reference\n",
+    "\n",
+    "- [SilentWear paper](https://arxiv.org/abs/2603.02847)\n",
+    "- [Onnx4Deeploy repo](https://github.com/runwangdl/Onnx4Deeploy) — PR #2: SpeechNet exporter\n",
+    "- [TrainDeeploy repo](https://github.com/runwangdl/TrainDeeploy) — PR #31: SpeechNet training test\n",
+    "- [Deeploy TileConstraint docs](../AI_AGENT/Deeploy_Basics/Deeploy_TileConstraint.md)\n",
+    "- [Deeploy Kernel docs](../AI_AGENT/Deeploy_Basics/Deeploy_Kernel.md)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
\ No newline at end of file