runwangdl · runwangdl · May 10, 2026 · May 10, 2026 · May 10, 2026 · May 10, 2026
@@ -29,20 +29,22 @@ jobs:
     with:
       docker_image_deeploy: ${{ inputs.docker_image_deeploy }}
 
-  # Training tests - L2 singlebuffer
-  siracusa-training-tiled-l2-singlebuffer:
+  # Training tests - L2 (SB + DB combined so the runner emits a single
+  # SB-vs-DB cycle comparison table to $GITHUB_STEP_SUMMARY).
+  siracusa-training-tiled-l2:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
     with:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l2 and singlebuffer"
+      pytest-marker: "training and l2"
 
-  # Training tests - L3 singlebuffer (models that spill weights/activations to L3)
-  siracusa-training-tiled-l3-singlebuffer:
+  # Training tests - L3 (SB + DB combined; DB uses TrainingDBOnlyL3Tiler so
+  # the L2 staging budget doesn't double).
+  siracusa-training-tiled-l3:
     needs: select-env
     uses: ./.github/workflows/_runner-siracusa-tiled.yml
     with:
       runner: ${{ needs.select-env.outputs.runner }}
       docker-image: ${{ needs.select-env.outputs.image }}
-      pytest-marker: "training and l3 and singlebuffer"
+      pytest-marker: "training and l3"
@@ -6,12 +6,9 @@
 name: CI • Siracusa
 
 "on":
-  push:
-    branches:
-      - "**"
-    tags:
-      - "v*.*.*"
-  pull_request:
+  # Auto-trigger disabled in TrainDeeploy fork: untiled Siracusa is not
+  # exercised by training/DB work. Re-enable by restoring the push: /
+  # pull_request: blocks.
   workflow_dispatch:
     inputs:
       docker_image_deeploy:

@@ -158,3 +158,81 @@ def toolchain(request):
 def cmake_args(request):
     """Return additional CMake arguments."""
     return request.config.getoption("--cmake-args")
+
+
+# ---------------------------------------------------------------------------
+# Training cycle summary: at session end, scan $GITHUB_STEP_SUMMARY for any
+# training cycle section emitted by run_and_assert_test, join SB and DB rows
+# by (test, l1), and append a comparison table with speedup.
+# ---------------------------------------------------------------------------
+def _parse_training_section(section_body: str):
+    """Extract the data rows of a Markdown training-cycle table.
+
+    Expected row layout:
+    `| test | l1 | mode | train_cycles | opt_cycles | weight_sram |`.
+    Lines that do not start with "| ", the header and separator rows, rows
+    with fewer than six cells, and rows whose three numeric cells are not
+    integers (thousands separators allowed) are ignored.
+
+    Returns a list of dicts with keys: test, l1, mode, train, opt, sram.
+    """
+
+    def _as_int(text):
+        # Cycle counts are written with "," thousands separators.
+        return int(text.replace(",", ""))
+
+    parsed = []
+    for raw in section_body.splitlines():
+        is_table_row = raw.startswith("| ")
+        is_header_or_rule = "train_cycles" in raw or "------" in raw
+        if not is_table_row or is_header_or_rule:
+            continue
+        fields = [piece.strip() for piece in raw.strip("|").split("|")]
+        if len(fields) < 6:
+            continue
+        test, l1, mode = fields[:3]
+        try:
+            train, opt, sram = (_as_int(value) for value in fields[3:6])
+        except ValueError:
+            # Not a numeric data row (e.g. a row from some other table).
+            continue
+        parsed.append(dict(test = test, l1 = l1, mode = mode, train = train, opt = opt, sram = sram))
+    return parsed
+
+
+def pytest_terminal_summary(terminalreporter, exitstatus, config):
+    """Append one SB-vs-DB comparison table per training-cycle heading.
+
+    Reads the file named by $GITHUB_STEP_SUMMARY, collects the rows of every
+    `## Siracusa L2|L3 training cycles` section, joins them by (test, l1) and
+    appends a `### <heading> — SB vs DB speedup` table for each heading that
+    has rows. When the same heading occurs more than once in the file, the
+    rows of all its occurrences are merged into a single table.
+
+    Best-effort: returns silently when the variable is unset or the file does
+    not exist; read/write failures are reported on the terminal and never
+    raised, so they cannot change the test outcome.
+    """
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not summary_path or not os.path.exists(summary_path):
+        return
+    # The summary holds non-ASCII characters ("—"), so the encoding is pinned
+    # instead of being inherited from the process locale.
+    try:
+        with open(summary_path, "r", encoding = "utf-8") as f:
+            existing = f.read()
+    except Exception as exc:
+        terminalreporter.write_line(f"Training cycle summary skipped (cannot read {summary_path}): {exc}")
+        return
+
+    # Gather rows per heading from the actual match positions, so that a
+    # heading appearing several times contributes every one of its sections
+    # (and still yields exactly one comparison table).
+    import re as _re
+    rows_by_heading: dict = {}
+    for match in _re.finditer(r"^## (Siracusa L[23] training cycles)$", existing, flags = _re.MULTILINE):
+        rest = existing[match.end():]
+        next_section = rest.find("\n## ")
+        body = rest if next_section == -1 else rest[:next_section]
+        rows_by_heading.setdefault(match.group(1), []).extend(_parse_training_section(body))
+
+    def _cycles(row, key):
+        # Thousands-separated count, or a dash when that mode was not run.
+        return f"{row[key]:,}" if row else "—"
+
+    def _delta(sb, db, key):
+        # Relative cycle reduction of DB vs SB; positive means DB used fewer
+        # cycles. Needs both modes and a non-zero SB baseline.
+        if sb and db and sb[key] > 0:
+            return f"{(sb[key] - db[key]) / sb[key] * 100:+.1f}%"
+        return "—"
+
+    chunks = []
+    for heading, rows in rows_by_heading.items():
+        if not rows:
+            continue
+        # Join SB and DB rows by (test, l1).
+        by_key: dict = {}
+        for r in rows:
+            by_key.setdefault((r["test"], r["l1"]), {})[r["mode"]] = r
+        chunks.append(f"\n### {heading} — SB vs DB speedup\n\n")
+        chunks.append("| Test | L1 (B) | SB train | DB train | train Δ | SB opt | DB opt | opt Δ |\n")
+        chunks.append("|------|--------|----------|----------|---------|--------|--------|-------|\n")
+        for (test, l1), modes in sorted(by_key.items()):
+            sb = modes.get("SB")
+            db = modes.get("DB")
+            chunks.append(f"| {test} | {l1} | {_cycles(sb, 'train')} | {_cycles(db, 'train')} | "
+                          f"{_delta(sb, db, 'train')} | {_cycles(sb, 'opt')} | {_cycles(db, 'opt')} | "
+                          f"{_delta(sb, db, 'opt')} |\n")
+    if not chunks:
+        return
+    try:
+        with open(summary_path, "a", encoding = "utf-8") as f:
+            f.write("".join(chunks))
+    except Exception as exc:
+        # Best-effort: report, but never fail the session over summary IO.
+        terminalreporter.write_line(f"Training cycle summary skipped (cannot write {summary_path}): {exc}")
@@ -34,7 +34,7 @@
 from testUtils.codeGenerateTraining import build_shared_buffer_maps, generateOptimizerTestNetwork
 from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform
 from testUtils.testRunner import TestGeneratorArgumentParser
-from testUtils.tilingUtils import TrainingSBTiler
+from testUtils.tilingUtils import TrainingDBTiler, TrainingSBTiler
 from testUtils.trainingUtils import _mockScheduler, add_optimizer_training_dir_arg
 
 from Deeploy.AbstractDataTypes import PointerClass
@@ -102,15 +102,17 @@ def generateTiledOptimizerNetwork(args) -> None:
         AnnotateDefaultMemoryLevel(memoryHierarchy),
     ])
 
-    # 7. Wrap with SBTiler (single-buffering; optimizer is forward-only, no lifetime extension needed).
-    unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer"
+    # 7. Wrap with tiler. SB by default; --doublebuffer switches to TrainingDBTiler.
+    unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer_DB{args.doublebuffer}"
     testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16]
 
-    # TrainingSBTiler extends all input buffer lifetimes to the end of the
-    # schedule (via TrainingMemoryScheduler).  This prevents the allocator from
-    # reusing the space of a consumed input (e.g. fc1 weight) for a later
-    # output (e.g. fc2 updated weight), which would corrupt the weight buffer.
-    deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir)
+    # TrainingSBTiler/TrainingDBTiler extend all input buffer lifetimes to the
+    # end of the schedule (via TrainingMemoryScheduler).  This prevents the
+    # allocator from reusing the space of a consumed input (e.g. fc1 weight)
+    # for a later output (e.g. fc2 updated weight), which would corrupt the
+    # weight buffer.
+    tilerCls = TrainingDBTiler if args.doublebuffer else TrainingSBTiler
+    deployer = TilerDeployerWrapper(deployer, tilerCls, testName = testIdentifier, workDir = args.dumpdir)
     deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc
     deployer.tiler.memoryAllocStrategy = args.memAllocStrategy
     deployer.tiler.searchStrategy = args.searchStrategy
@@ -159,6 +161,9 @@ def generateTiledOptimizerNetwork(args) -> None:
                         type = str,
                         default = "L2",
                         help = "Default memory level for IO buffers. Default: L2.")
+    parser.add_argument("--doublebuffer",
+                        action = "store_true",
+                        help = "Enable double buffering for tile DMA transfers (TrainingDBTiler).")
     parser.add_argument("--memAllocStrategy",
                         type = str,
                         default = "MiniMalloc",

@@ -13,7 +13,7 @@
 from testUtils.codeGenerateTraining import generateTrainingTestNetwork
 from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform
 from testUtils.testRunner import TestGeneratorArgumentParser
-from testUtils.tilingUtils import TrainingSBTiler
+from testUtils.tilingUtils import TrainingDBOnlyL3Tiler, TrainingDBTiler, TrainingSBTiler
 from testUtils.trainingUtils import _GRAD_ACC, _infer_data_size, _infer_n_accum, _infer_num_data_inputs, \
     _infer_total_mb, _load_reference_losses, _mockScheduler, add_training_inference_args
 from testUtils.typeMapping import inferTypeAndOffset
@@ -132,11 +132,16 @@ def generateTiledTrainingNetwork(args) -> None:
         AnnotateDefaultMemoryLevel(memoryHierarchy),
     ])
 
-    # 9. Wrap with tiler (TrainingSBTiler: SB strategy + extended input lifetimes for backward pass).
-    unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}"
+    # 9. Wrap with tiler. SB (TrainingSBTiler) by default; --doublebuffer
+    #    switches to TrainingDBTiler (DB strategy + same TrainingMemoryScheduler
+    #    input-lifetime extension), or to TrainingDBOnlyL3Tiler when
+    #    args.defaultMemLevel is "L3".
+    unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_DB{args.doublebuffer}"
     testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16]
 
-    deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir)
+    if args.doublebuffer:
+        tilerCls = TrainingDBOnlyL3Tiler if args.defaultMemLevel == "L3" else TrainingDBTiler
+    else:
+        tilerCls = TrainingSBTiler
+    deployer = TilerDeployerWrapper(deployer, tilerCls, testName = testIdentifier, workDir = args.dumpdir)
     deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc
     deployer.tiler.memoryAllocStrategy = args.memAllocStrategy
     deployer.tiler.searchStrategy = args.searchStrategy
@@ -245,6 +250,9 @@ def generateTiledTrainingNetwork(args) -> None:
                         type = str,
                         default = "L2",
                         help = "Default memory level for IO buffers. Default: L2.")
+    parser.add_argument("--doublebuffer",
+                        action = "store_true",
+                        help = "Enable double buffering for tile DMA transfers (TrainingDBTiler).")
     parser.add_argument("--memAllocStrategy",
                         type = str,
                         default = "MiniMalloc",

@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import os
+import re
 from pathlib import Path
 from typing import List, Literal, Optional
 
@@ -17,6 +18,16 @@
     'run_simulation',
 ]
 
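+# Names of the Markdown sections whose table header this process has already
+# written to $GITHUB_STEP_SUMMARY. Lets parametrised cases that share a
+# section append rows without repeating the header. The set is per-process
+# state, so separate processes (e.g. pytest-xdist workers, if used) each
+# write their own header.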
+_METRIC_SECTIONS_WRITTEN: set = set()
+
+# `BENCH train_cycles=<N> opt_cycles=<M> weight_sram=<K>` — printed once per
+# training run by the test harness; captured here so we can append a cycles
+# row to $GITHUB_STEP_SUMMARY for SB-vs-DB comparison.
+_TRAIN_BENCH_RE = re.compile(r"BENCH train_cycles=(\d+) opt_cycles=(\d+) weight_sram=(\d+)")
+
 
 def get_worker_id() -> str:
     """
@@ -122,10 +133,53 @@ def create_test_config(
     return config
 
 
-def run_and_assert_test(test_name: str, config: DeeployTestConfig, skipgen: bool, skipsim: bool) -> None:
+def _emit_training_cycle_row(test_name: str, config: DeeployTestConfig, stdout: str, metric_section: str) -> None:
+    """Append one cycle-count row for this test to $GITHUB_STEP_SUMMARY.
+
+    Parses `BENCH train_cycles=... opt_cycles=... weight_sram=...` from
+    `stdout` and appends a Markdown table row under `## {metric_section}`.
+    The section header is written the first time a section is seen by this
+    process (tracked in _METRIC_SECTIONS_WRITTEN).
+
+    Args:
+        test_name: Value of the row's "Test" column.
+        config: Test configuration; `gen_args` is inspected for
+            `--doublebuffer` (Mode column: "DB", otherwise "SB") and for the
+            L1 size, given as `--l1=VALUE` or as `--l1 VALUE`.
+        stdout: Captured test output to search for the BENCH line.
+        metric_section: Heading text of the Markdown section to append to.
+
+    No-op when $GITHUB_STEP_SUMMARY is unset or when no BENCH line was
+    captured (e.g. inference tests, --skipsim runs).
+    """
+    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
+    if not summary_path:
+        return
+    m = _TRAIN_BENCH_RE.search(stdout or "")
+    if not m:
+        return
+    # The pattern only captures digit runs, so int() cannot fail here.
+    train_cycles, opt_cycles, weight_sram = (int(group) for group in m.groups())
+    gen_args = list(config.gen_args or [])
+    db_flag = "DB" if "--doublebuffer" in gen_args else "SB"
+    l1 = "—"
+    for idx, arg in enumerate(gen_args):
+        if arg.startswith("--l1="):
+            l1 = arg.split("=", 1)[1]
+            break
+        # Also accept the two-token form: "--l1", "<value>".
+        if arg == "--l1" and idx + 1 < len(gen_args):
+            l1 = gen_args[idx + 1]
+            break
+    try:
+        # Pin UTF-8: the row may contain "—", which a non-UTF-8 locale
+        # encoding cannot represent.
+        with open(summary_path, "a", encoding = "utf-8") as f:
+            if metric_section not in _METRIC_SECTIONS_WRITTEN:
+                f.write(f"\n## {metric_section}\n\n")
+                f.write("| Test | L1 (B) | Mode | train_cycles | opt_cycles | weight_sram |\n")
+                f.write("|------|--------|------|--------------|------------|-------------|\n")
+                _METRIC_SECTIONS_WRITTEN.add(metric_section)
+            f.write(f"| {test_name} | {l1} | {db_flag} | {train_cycles:,} | {opt_cycles:,} | "
+                    f"{weight_sram:,} |\n")
+    except Exception:
+        # Best-effort: never let summary IO failure mask a real test result.
+        pass
+
+
+def run_and_assert_test(test_name: str,
+                        config: DeeployTestConfig,
+                        skipgen: bool,
+                        skipsim: bool,
+                        metric_section: Optional[str] = None) -> None:
     """
     Shared helper function to run a test and assert its results.
 
+    When `metric_section` is non-None and $GITHUB_STEP_SUMMARY is set, append
+    a cycle-count row to that Markdown section so reviewers can see SB-vs-DB
+    deltas directly in the workflow summary panel.
+
     Raises:
         AssertionError: If test fails or has errors
     """
@@ -136,3 +190,6 @@ def run_and_assert_test(test_name: str, config: DeeployTestConfig, skipgen: bool
 
     if result.error_count >= 0:
         assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests")
+
+    if metric_section:
+        _emit_training_cycle_row(test_name, config, result.stdout, metric_section)