diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml
index b65cbb75..a038d5b3 100644
--- a/.github/workflows/ci-platform-siracusa-tiled.yml
+++ b/.github/workflows/ci-platform-siracusa-tiled.yml
@@ -29,20 +29,22 @@ jobs:
     with:
       docker_image_deeploy: ${{ inputs.docker_image_deeploy }}
 
-  # Training tests - L2 singlebuffer
-  siracusa-training-tiled-l2-singlebuffer:
+  # Training tests - L2 (SB + DB combined so the runner emits a single
+  # SB-vs-DB cycle comparison table to $GITHUB_STEP_SUMMARY).
+  siracusa-training-tiled-l2:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
     with:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l2 and singlebuffer"
+      pytest-marker: "training and l2"
 
-  # Training tests - L3 singlebuffer (models that spill weights/activations to L3)
-  siracusa-training-tiled-l3-singlebuffer:
+  # Training tests - L3 (SB + DB combined; DB uses TrainingDBOnlyL3Tiler so
+  # the L2 staging budget doesn't double).
+  siracusa-training-tiled-l3:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
     with:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l3 and singlebuffer"
+      pytest-marker: "training and l3"
diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml
index 7a4f415e..839af4b4 100644
--- a/.github/workflows/ci-platform-siracusa.yml
+++ b/.github/workflows/ci-platform-siracusa.yml
@@ -6,12 +6,9 @@
 name: CI • Siracusa
 
 "on":
-  push:
-    branches:
-      - "**"
-    tags:
-      - "v*.*.*"
-  pull_request:
+  # Auto-trigger disabled in TrainDeeploy fork: untiled Siracusa is not
+  # exercised by training/DB work. Re-enable by restoring the push: /
+  # pull_request: blocks.
   workflow_dispatch:
     inputs:
       docker_image_deeploy:
diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py
index f29891bf..c247053d 100644
--- a/DeeployTest/conftest.py
+++ b/DeeployTest/conftest.py
@@ -158,3 +158,81 @@ def toolchain(request):
 def cmake_args(request):
     """Return additional CMake arguments."""
     return request.config.getoption("--cmake-args")
+
+
+# ---------------------------------------------------------------------------
+# Training cycle summary: at session end, scan $GITHUB_STEP_SUMMARY for any
+# training cycle section emitted by run_and_assert_test, join SB and DB rows
+# by (test, l1), and append a comparison table with speedup.
+# ---------------------------------------------------------------------------
+def _parse_training_section(section_body: str):
+    """Extract the data rows of a training-cycle Markdown table.
+
+    Each row has the form
+    `| test | l1 | mode | train_cycles | opt_cycles | weight_sram |`; the
+    header row, the separator row, rows with fewer than six cells and rows
+    whose numeric cells are not integers (thousands separators are allowed)
+    are skipped.
+    Returns a list of dicts with keys: test, l1, mode, train, opt, sram.
+    """
+    parsed = []
+    for raw in section_body.splitlines():
+        is_table_row = raw.startswith("| ") and "train_cycles" not in raw and "------" not in raw
+        fields = [part.strip() for part in raw.strip("|").split("|")] if is_table_row else []
+        if len(fields) < 6:
+            continue
+        try:
+            train, opt, sram = (int(fields[i].replace(",", "")) for i in (3, 4, 5))
+        except ValueError:
+            continue
+        parsed.append(dict(test = fields[0], l1 = fields[1], mode = fields[2], train = train, opt = opt, sram = sram))
+    return parsed
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    """Append one SB-vs-DB comparison table per training-cycle heading.
+
+    Rows from every `## Siracusa L? training cycles` section found in
+    $GITHUB_STEP_SUMMARY are merged per heading (a heading may occur more
+    than once in the file) and joined on (test, l1). Best-effort: I/O
+    failures are swallowed so they can never mask the test outcome.
+    """
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not summary_path or not os.path.exists(summary_path):
+        return
+    try:
+        with open(summary_path, "r") as f:
+            existing = f.read()
+    except Exception:
+        return
+    import re as _re
+    joined: dict = {}  # heading -> {(test, l1): {mode: row}}, in first-seen order
+    for m in _re.finditer(r"^## (Siracusa L[23] training cycles)$", existing, flags = _re.MULTILINE):
+        # Slice by match position so a repeated heading reads its own section.
+        end = existing.find("\n## ", m.end())
+        body = existing[m.end():] if end == -1 else existing[m.end():end]
+        for r in _parse_training_section(body):
+            joined.setdefault(m.group(1), {}).setdefault((r["test"], r["l1"]), {})[r["mode"]] = r
+    if not joined:
+        return
+
+    def _cell(row, key):
+        return f"{row[key]:,}" if row else "—"
+    def _delta(sb, db, key):
+        # Positive means DB needed fewer cycles than SB.
+        ok = sb and db and sb[key] > 0
+        return f"{(sb[key] - db[key]) / sb[key] * 100:+.1f}%" if ok else "—"
+
+    try:
+        with open(summary_path, "a") as f:
+            for heading, by_key in joined.items():
+                f.write(f"\n### {heading} — SB vs DB speedup\n\n")
+                f.write("| Test | L1 (B) | SB train | DB train | train Δ | SB opt | DB opt | opt Δ |\n")
+                f.write("|------|--------|----------|----------|---------|--------|--------|-------|\n")
+                for (test, l1), modes in sorted(by_key.items()):
+                    sb, db = modes.get("SB"), modes.get("DB")
+                    f.write(f"| {test} | {l1} | {_cell(sb, 'train')} | {_cell(db, 'train')} | "
+                            f"{_delta(sb, db, 'train')} | {_cell(sb, 'opt')} | {_cell(db, 'opt')} | "
+                            f"{_delta(sb, db, 'opt')} |\n")
+    except Exception:
+        pass
diff --git a/DeeployTest/testMVPOptimizer.py b/DeeployTest/testMVPOptimizer.py
index 0d6497f8..0668aea6 100644
--- a/DeeployTest/testMVPOptimizer.py
+++ b/DeeployTest/testMVPOptimizer.py
@@ -34,7 +34,7 @@
 from testUtils.codeGenerateTraining import build_shared_buffer_maps, generateOptimizerTestNetwork
 from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform
 from testUtils.testRunner import TestGeneratorArgumentParser
-from testUtils.tilingUtils import TrainingSBTiler
+from testUtils.tilingUtils import TrainingDBTiler, TrainingSBTiler
 from testUtils.trainingUtils import _mockScheduler, add_optimizer_training_dir_arg
 
 from Deeploy.AbstractDataTypes import PointerClass
@@ -102,15 +102,17 @@ def generateTiledOptimizerNetwork(args) -> None:
         AnnotateDefaultMemoryLevel(memoryHierarchy),
     ])
 
-    # 7. Wrap with SBTiler (single-buffering; optimizer is forward-only, no lifetime extension needed).
-    unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer"
+    # 7. Wrap with tiler. SB by default; --doublebuffer switches to TrainingDBTiler.
+    unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer_DB{args.doublebuffer}"
     testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16]
 
-    # TrainingSBTiler extends all input buffer lifetimes to the end of the
-    # schedule (via TrainingMemoryScheduler).  This prevents the allocator from
-    # reusing the space of a consumed input (e.g. fc1 weight) for a later
-    # output (e.g. fc2 updated weight), which would corrupt the weight buffer.
-    deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir)
+    # TrainingSBTiler/TrainingDBTiler extend all input buffer lifetimes to the
+    # end of the schedule (via TrainingMemoryScheduler).  This prevents the
+    # allocator from reusing the space of a consumed input (e.g. fc1 weight)
+    # for a later output (e.g. fc2 updated weight), which would corrupt the
+    # weight buffer.
+    tilerCls = TrainingDBTiler if args.doublebuffer else TrainingSBTiler
+    deployer = TilerDeployerWrapper(deployer, tilerCls, testName = testIdentifier, workDir = args.dumpdir)
     deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc
     deployer.tiler.memoryAllocStrategy = args.memAllocStrategy
     deployer.tiler.searchStrategy = args.searchStrategy
@@ -159,6 +161,9 @@ def generateTiledOptimizerNetwork(args) -> None:
                         type = str,
                         default = "L2",
                         help = "Default memory level for IO buffers. Default: L2.")
+    parser.add_argument("--doublebuffer",
+                        action = "store_true",
+                        help = "Enable double buffering for tile DMA transfers (TrainingDBTiler).")
     parser.add_argument("--memAllocStrategy",
                         type = str,
                         default = "MiniMalloc",
diff --git a/DeeployTest/testMVPTraining.py b/DeeployTest/testMVPTraining.py
index bf69b090..32ba10bf 100644
--- a/DeeployTest/testMVPTraining.py
+++ b/DeeployTest/testMVPTraining.py
@@ -13,7 +13,7 @@
 from testUtils.codeGenerateTraining import generateTrainingTestNetwork
 from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform
 from testUtils.testRunner import TestGeneratorArgumentParser
-from testUtils.tilingUtils import TrainingSBTiler
+from testUtils.tilingUtils import TrainingDBOnlyL3Tiler, TrainingDBTiler, TrainingSBTiler
 from testUtils.trainingUtils import _GRAD_ACC, _infer_data_size, _infer_n_accum, _infer_num_data_inputs, \
     _infer_total_mb, _load_reference_losses, _mockScheduler, add_training_inference_args
 from testUtils.typeMapping import inferTypeAndOffset
@@ -132,11 +132,16 @@ def generateTiledTrainingNetwork(args) -> None:
         AnnotateDefaultMemoryLevel(memoryHierarchy),
     ])
 
-    # 9. Wrap with tiler (TrainingSBTiler: SB strategy + extended input lifetimes for backward pass).
-    unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}"
+    # 9. Wrap with tiler. SB by default; --doublebuffer switches to TrainingDBTiler
+    #    (DB strategy + same TrainingMemoryScheduler input-lifetime extension).
+    unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_DB{args.doublebuffer}"
     testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16]
 
-    deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir)
+    if args.doublebuffer:
+        tilerCls = TrainingDBOnlyL3Tiler if args.defaultMemLevel == "L3" else TrainingDBTiler
+    else:
+        tilerCls = TrainingSBTiler
+    deployer = TilerDeployerWrapper(deployer, tilerCls, testName = testIdentifier, workDir = args.dumpdir)
     deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc
     deployer.tiler.memoryAllocStrategy = args.memAllocStrategy
     deployer.tiler.searchStrategy = args.searchStrategy
@@ -245,6 +250,9 @@ def generateTiledTrainingNetwork(args) -> None:
                         type = str,
                         default = "L2",
                         help = "Default memory level for IO buffers. Default: L2.")
+    parser.add_argument("--doublebuffer",
+                        action = "store_true",
+                        help = "Enable double buffering for tile DMA transfers (TrainingDBTiler).")
     parser.add_argument("--memAllocStrategy",
                         type = str,
                         default = "MiniMalloc",
diff --git a/DeeployTest/testUtils/pytestRunner.py b/DeeployTest/testUtils/pytestRunner.py
index cedc8b7a..960b3031 100644
--- a/DeeployTest/testUtils/pytestRunner.py
+++ b/DeeployTest/testUtils/pytestRunner.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+import re
 from pathlib import Path
 from typing import List, Literal, Optional
 
@@ -17,6 +18,16 @@
     'run_simulation',
 ]
 
+# Tracks which Markdown sections we've already emitted a header for inside the
+# current pytest session. Keeps run_and_assert_test idempotent across
+# parametrised cases that share a section.
+_METRIC_SECTIONS_WRITTEN: set = set()
+
+# `BENCH train_cycles=<N> opt_cycles=<M> weight_sram=<K>` — printed once per
+# training run by the test harness; captured here so we can append a cycles
+# row to $GITHUB_STEP_SUMMARY for SB-vs-DB comparison.
+_TRAIN_BENCH_RE = re.compile(r"BENCH train_cycles=(\d+) opt_cycles=(\d+) weight_sram=(\d+)")
+
 
 def get_worker_id() -> str:
     """
@@ -122,10 +133,53 @@ def create_test_config(
     return config
 
 
-def run_and_assert_test(test_name: str, config: DeeployTestConfig, skipgen: bool, skipsim: bool) -> None:
+def _emit_training_cycle_row(test_name: str, config: DeeployTestConfig, stdout: str, metric_section: str) -> None:
+    """Append one cycle-count row for this test to $GITHUB_STEP_SUMMARY.
+
+    The numbers come from the `BENCH train_cycles=...` line in `stdout`; the
+    row goes under `## {metric_section}`, whose heading and table header are
+    written only the first time the section is seen (_METRIC_SECTIONS_WRITTEN).
+
+    Does nothing when $GITHUB_STEP_SUMMARY is unset or `stdout` holds no BENCH
+    line (e.g. inference tests, --skipsim runs).
+    """
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    bench = _TRAIN_BENCH_RE.search(stdout or "") if summary_path else None
+    if bench is None:
+        return
+    # The pattern only captures digit runs, so int() cannot fail here.
+    train_cycles, opt_cycles, weight_sram = (int(value) for value in bench.groups())
+    gen_args = config.gen_args or []
+    mode = "DB" if "--doublebuffer" in gen_args else "SB"
+    # The first `--l1=<value>` argument wins; a dash marks "not specified".
+    l1 = next((arg.split("=", 1)[1] for arg in gen_args if arg.startswith("--l1=")), "—")
+    try:
+        with open(summary_path, "a") as summary:
+            # Mark the section as written only after its header went out.
+            if metric_section not in _METRIC_SECTIONS_WRITTEN:
+                summary.write(f"\n## {metric_section}\n\n"
+                              "| Test | L1 (B) | Mode | train_cycles | opt_cycles | weight_sram |\n"
+                              "|------|--------|------|--------------|------------|-------------|\n")
+                _METRIC_SECTIONS_WRITTEN.add(metric_section)
+            summary.write(f"| {test_name} | {l1} | {mode} | {train_cycles:,} | {opt_cycles:,} | "
+                          f"{weight_sram:,} |\n")
+    except Exception:
+        # Best-effort: a summary I/O failure must never mask the real test result.
+        pass
+
+
+def run_and_assert_test(test_name: str,
+                        config: DeeployTestConfig,
+                        skipgen: bool,
+                        skipsim: bool,
+                        metric_section: Optional[str] = None) -> None:
     """
     Shared helper function to run a test and assert its results.
 
+    When `metric_section` is non-None and $GITHUB_STEP_SUMMARY is set, append
+    a cycle-count row to that Markdown section so reviewers can see SB-vs-DB
+    deltas directly in the workflow summary panel.
+
     Raises:
         AssertionError: If test fails or has errors
     """
@@ -136,3 +190,6 @@ def run_and_assert_test(test_name: str, config: DeeployTestConfig, skipgen: bool
 
     if result.error_count >= 0:
         assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests")
+
+    if metric_section:
+        _emit_training_cycle_row(test_name, config, result.stdout, metric_section)
diff --git a/DeeployTest/testUtils/tilingUtils.py b/DeeployTest/testUtils/tilingUtils.py
index 1dfb43be..8b96810a 100644
--- a/DeeployTest/testUtils/tilingUtils.py
+++ b/DeeployTest/testUtils/tilingUtils.py
@@ -4,6 +4,7 @@
 
 from typing import Dict, List, Tuple, Union
 
+import numpy as np
 from ortools.constraint_solver.pywrapcp import IntVar
 
 from Deeploy.DeeployTypes import NetworkContext, SubGraph, TransientBuffer
@@ -13,6 +14,17 @@
 from Deeploy.TilingExtension.TilerModel import TilerModel
 
 
+def _isScalarBuffer(ctxt: NetworkContext, tensorName: str) -> bool:
+    """True when `tensorName` holds at most one element (product of dims <= 1).
+    Such a tensor cannot be split across two physical L1 slots, so
+    double-buffering it only wastes L1 and trips the shape assertion in the
+    DB pass's `_hoistMultibufferReferences`; it is treated as SB instead.
+    """
+    dims = ctxt.lookup(tensorName).shape
+    numel = dims if isinstance(dims, int) else int(np.prod(dims))
+    return numel <= 1
+
+
 class DBOnlyL3Tiler(Tiler):
 
     def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
@@ -37,6 +49,9 @@ def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, patt
         if isinstance(buffer, TransientBuffer):
             return 1
 
+        if _isScalarBuffer(ctxt, tensorName):
+            return 1
+
         return 2
 
 
@@ -69,3 +84,69 @@ def _calculateLifetimes(self, ctxt: NetworkContext, patternMemoryConstraint: Pat
 
 class TrainingSBTiler(SBTiler):
     memorySchedulerClass = TrainingMemoryScheduler
+
+
+class TrainingDBTiler(DBTiler):
+    """Double-buffering tiler for training graphs.
+
+    Uses TrainingMemoryScheduler and falls back to single buffering
+    (coefficient 1) for whole patterns that DB cannot handle.
+    """
+    memorySchedulerClass = TrainingMemoryScheduler
+
+    # Ops that cannot rely on the scalar-pattern check in
+    # multiBufferStrategy and are therefore opted out by name. The other
+    # previously opted-out ops (SoftmaxCrossEntropyLoss, MSELoss,
+    # MSELossGrad, Gemm) all have a scalar tensor (loss, lazy_reset_grad)
+    # somewhere in their pattern, so that check already sends them to SB.
+    DB_OPT_OUT_OPS = frozenset({
+        # Outputs _alias'd to an input (in-place): DB's per-tensor
+        # multibuffer hoist would split the alias across two L1 slots and
+        # break in-place semantics. InPlaceAccumulatorV2 also carries the
+        # lazy_reset_grad scalar but stays listed because the alias
+        # semantics are the primary concern.
+        "SGD",
+        "InPlaceAccumulatorV2",
+        # output_grad feeds 2 backward Gemms (multi-consumer intermediate);
+        # DB's per-consumer hoist inflates _users and breaks
+        # MemoryAllocation _live tracking. Tracked separately: the real fix
+        # belongs in the DB pass / _users accounting, not in an opt-out.
+        "SoftmaxCrossEntropyLossGrad",
+    })
+
+    def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
+                            hop: str, tensorName: str) -> Union[int, IntVar]:
+        # Any opt-out op forces SB for every tensor of the pattern. The DB
+        # pass then sees an offsetList of length 1 and returns
+        # applicable=False; SB.apply (run before DB.apply) has already
+        # produced correct code for the pattern.
+        if any(node.op in self.DB_OPT_OUT_OPS for node in pattern):
+            return 1
+        # A single scalar tensor (product of dims <= 1) forces coefficient 1
+        # for the WHOLE pattern. Mixed coefficients (scalar=1, others=2) make
+        # neither SB.apply (needs all=1) nor DB.apply (needs all=2)
+        # applicable; codegen then degenerates to a bare kernel call with NO
+        # DMA setup and the kernel reads stale L1 data. That was the real
+        # cause of the "autoencoder weights frozen" symptom previously
+        # mis-attributed to Gemm: MSELoss's scalar `loss` output hit this case.
+        for node in pattern:
+            for tensor in [*node.inputs, *node.outputs]:
+                if ctxt.is_buffer(tensor.name) and _isScalarBuffer(ctxt, tensor.name):
+                    return 1
+        return super().multiBufferStrategy(tilerModel, ctxt, pattern, path, hop, tensorName)
+
+
+class TrainingDBOnlyL3Tiler(TrainingDBTiler):
+    """Training DB tiler for L3 mode: only the L3→L2 hop is double-buffered.
+
+    The L2→L1 hop stays single-buffered, mirroring the inference path's
+    `DBOnlyL3Tiler`. Plain `TrainingDBTiler` doubles the coefficient of every
+    memory hop; with `defaultMemLevel=L3` the L2 staging buffers double too,
+    which blows the 2 MB L2 capacity on ResNet8/MobileNetV1 training graphs
+    (the constraint solver returns infeasible).
+    """
+
+    def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
+                            hop: str, tensorName: str) -> Union[int, IntVar]:
+        # The L1 hop is never multi-buffered; every other hop defers to TrainingDBTiler.
+        return 1 if hop == "L1" else super().multiBufferStrategy(tilerModel, ctxt, pattern, path, hop, tensorName)
diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py
index 7eee2085..2184857c 100644
--- a/DeeployTest/test_platforms.py
+++ b/DeeployTest/test_platforms.py
@@ -35,10 +35,13 @@
 from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS as NEUREKA_L3_DOUBLEBUFFER_MODELS
 from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS_WMEM as NEUREKA_L3_DOUBLEBUFFER_MODELS_WMEM
 from test_siracusa_neureka_tiled_config import L3_SINGLEBUFFER_MODELS as NEUREKA_L3_SINGLEBUFFER_MODELS
-from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS, L2_SINGLEBUFFER_KERNELS, \
-    L2_SINGLEBUFFER_MODELS
+from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS
+from test_siracusa_tiled_config import L2_DOUBLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_DOUBLEBUFFER_TRAINING_MODELS
+from test_siracusa_tiled_config import L2_SINGLEBUFFER_KERNELS, L2_SINGLEBUFFER_MODELS
 from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS
-from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS
+from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS
+from test_siracusa_tiled_config import L3_DOUBLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_DOUBLEBUFFER_TRAINING_MODELS
+from test_siracusa_tiled_config import L3_SINGLEBUFFER_MODELS
 from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS
 from test_siracusa_tiled_config import TRAINING_MODEL_OVERRIDES as SIRACUSA_TRAINING_MODEL_OVERRIDES
 from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES
@@ -376,7 +379,7 @@ def test_siracusa_tiled_training_l2_singlebuffer(test_params, deeploy_test_dir,
         training_num_data_inputs = overrides.get("num_data_inputs"),
         training_tolerance = overrides.get("tolerance"),
     )
-    run_and_assert_test(test_name, config, skipgen, skipsim)
+    run_and_assert_test(test_name, config, skipgen, skipsim, metric_section = "Siracusa L2 training cycles")
 
 
 @pytest.mark.siracusa_tiled
@@ -410,7 +413,75 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir,
         training_num_data_inputs = overrides.get("num_data_inputs"),
         training_tolerance = overrides.get("tolerance"),
     )
-    run_and_assert_test(test_name, config, skipgen, skipsim)
+    run_and_assert_test(test_name, config, skipgen, skipsim, metric_section = "Siracusa L3 training cycles")
+
+
+@pytest.mark.siracusa_tiled
+@pytest.mark.training
+@pytest.mark.doublebuffer
+@pytest.mark.l3
+@pytest.mark.parametrize("test_params",
+                         generate_test_params(SIRACUSA_L3_DOUBLEBUFFER_TRAINING_MODELS, "L3-doublebuffer-training"),
+                         ids = param_id)
+def test_siracusa_tiled_training_l3_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args,
+                                                 skipgen, skipsim) -> None:
+    """Double-buffered tiled training on Siracusa (gvsoc) with L3 as the default memory level."""
+    model, l1_size, _unused_config_name = test_params
+    per_model = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(model, {})
+    # Cycle counts are reported under the "Siracusa L3 training cycles" summary section.
+    config = create_test_config(
+        test_name = model,
+        platform = "Siracusa",
+        simulator = "gvsoc",
+        deeploy_test_dir = deeploy_test_dir,
+        toolchain = toolchain,
+        toolchain_dir = toolchain_dir,
+        cmake_args = cmake_args,
+        tiling = True,
+        cores = SIRACUSA_DEFAULT_CORES,
+        l1 = l1_size,
+        l2 = 2000000,
+        default_mem_level = "L3",
+        double_buffer = True,
+        training = True,
+        training_num_data_inputs = per_model.get("num_data_inputs"),
+        training_tolerance = per_model.get("tolerance"),
+    )
+    run_and_assert_test(model, config, skipgen, skipsim, metric_section = "Siracusa L3 training cycles")
+
+
+@pytest.mark.siracusa_tiled
+@pytest.mark.training
+@pytest.mark.doublebuffer
+@pytest.mark.l2
+@pytest.mark.parametrize("test_params",
+                         generate_test_params(SIRACUSA_L2_DOUBLEBUFFER_TRAINING_MODELS, "L2-doublebuffer-training"),
+                         ids = param_id)
+def test_siracusa_tiled_training_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args,
+                                                 skipgen, skipsim) -> None:
+    """Double-buffered tiled training on Siracusa (gvsoc) with L2 as the default memory level."""
+    model, l1_size, _unused_config_name = test_params
+    per_model = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(model, {})
+    # Cycle counts are reported under the "Siracusa L2 training cycles" summary section.
+    config = create_test_config(
+        test_name = model,
+        platform = "Siracusa",
+        simulator = "gvsoc",
+        deeploy_test_dir = deeploy_test_dir,
+        toolchain = toolchain,
+        toolchain_dir = toolchain_dir,
+        cmake_args = cmake_args,
+        tiling = True,
+        cores = SIRACUSA_DEFAULT_CORES,
+        l1 = l1_size,
+        l2 = 2000000,
+        default_mem_level = "L2",
+        double_buffer = True,
+        training = True,
+        training_num_data_inputs = per_model.get("num_data_inputs"),
+        training_tolerance = per_model.get("tolerance"),
+    )
+    run_and_assert_test(model, config, skipgen, skipsim, metric_section = "Siracusa L2 training cycles")
 
 
 @pytest.mark.siracusa_tiled
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index bafa6635..06e67f71 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -159,7 +159,9 @@
 # L2 size is fixed by the runner at 2_000_000 to match the validated local run.
 L2_SINGLEBUFFER_TRAINING_MODELS = {
     "Models/Training/SimpleMLP/simplemlp_train": [64000],
-    "Models/Training/Autoencoder/autoencoder_train": [128000],
+    # 32 KB variant matches the L2 DB matrix so the SB/DB join table in
+    # the workflow summary actually pairs up.
+    "Models/Training/Autoencoder/autoencoder_train": [128000, 32000],
     "Models/Training/DSCNN/dscnn_train": [128000, 64000],
 }
 
@@ -172,6 +174,28 @@
     "Models/Training/CCT_LoRA/cct_lora_train": [128000],
 }
 
+# Double-buffered L2 training models: SimpleMLP, Autoencoder and DSCNN.
+# L2 DB at L1=128 KB → almost all ops are 1-tile (tensors fit comfortably);
+# the DB pass triggers but has nothing to pipeline. The 32 KB autoencoder
+# variant is included so ~8 of 55 ops become 2-4 tiles and DB pipelining
+# actually fires. DSCNN is structurally DB-unfriendly at L2 (depthwise/
+# pointwise Conv weights are tiny, only ~1 of 97 ops multi-tiles even at
+# L1=16 KB); it is listed at 128 KB only.
+L2_DOUBLEBUFFER_TRAINING_MODELS = {
+    "Models/Training/SimpleMLP/simplemlp_train": [64000],
+    "Models/Training/Autoencoder/autoencoder_train": [128000, 32000],
+    "Models/Training/DSCNN/dscnn_train": [128000],
+}
+
+# L3 DB training: only DB the L3↔L2 hop (TrainingDBOnlyL3Tiler) so the L2
+# staging budget doesn't double. CCT/CCT_LoRA left out — their backward
+# alias graph still trips MemoryAllocation _live tracking even with our
+# opt-out blacklist (a separate follow-up).
+L3_DOUBLEBUFFER_TRAINING_MODELS = {
+    "Models/Training/ResNet8/resnet8_train": [128000],
+    "Models/Training/MobileNetV1/mobilenetv1_train": [128000],
+}
+
 # Per-model overrides for training tests.
 #
 # - num_data_inputs: required when inputs.npz has only one mini-batch (no