diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index b65cbb75..a038d5b3 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -29,20 +29,22 @@ jobs: with: docker_image_deeploy: ${{ inputs.docker_image_deeploy }} - # Training tests - L2 singlebuffer - siracusa-training-tiled-l2-singlebuffer: + # Training tests - L2 (SB + DB combined so the runner emits a single + # SB-vs-DB cycle comparison table to $GITHUB_STEP_SUMMARY). + siracusa-training-tiled-l2: needs: select-env uses: ./.github/workflows/_runner-siracusa-tiled.yml with: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "training and l2 and singlebuffer" + pytest-marker: "training and l2" - # Training tests - L3 singlebuffer (models that spill weights/activations to L3) - siracusa-training-tiled-l3-singlebuffer: + # Training tests - L3 (SB + DB combined; DB uses TrainingDBOnlyL3Tiler so + # the L2 staging budget doesn't double). + siracusa-training-tiled-l3: needs: select-env uses: ./.github/workflows/_runner-siracusa-tiled.yml with: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "training and l3 and singlebuffer" + pytest-marker: "training and l3" diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml index 7a4f415e..839af4b4 100644 --- a/.github/workflows/ci-platform-siracusa.yml +++ b/.github/workflows/ci-platform-siracusa.yml @@ -6,12 +6,9 @@ name: CI • Siracusa "on": - push: - branches: - - "**" - tags: - - "v*.*.*" - pull_request: + # Auto-trigger disabled in TrainDeeploy fork: untiled Siracusa is not + # exercised by training/DB work. Re-enable by restoring the push: / + # pull_request: blocks. 
# ---------------------------------------------------------------------------
# Training cycle summary: at session end, scan $GITHUB_STEP_SUMMARY for any
# training cycle section emitted by run_and_assert_test, join SB and DB rows
# by (test, l1), and append a comparison table with speedup.
# ---------------------------------------------------------------------------
def _parse_training_section(section_body: str):
    """Parse rows of `| test | l1 | mode | train_cycles | opt_cycles | weight_sram |`.

    Lines that do not start with "| ", the header row, the separator row, rows
    with fewer than six cells, and rows whose three numeric cells are not
    integers (thousands separators are accepted) are skipped.

    Returns list of dicts with keys: test, l1, mode, train, opt, sram.
    """
    rows = []
    for line in section_body.splitlines():
        if not line.startswith("| "):
            continue
        if "train_cycles" in line or "------" in line:
            continue
        cells = [c.strip() for c in line.strip("|").split("|")]
        if len(cells) < 6:
            continue
        try:
            train, opt, sram = (int(c.replace(",", "")) for c in cells[3:6])
        except ValueError:
            continue
        rows.append({"test": cells[0], "l1": cells[1], "mode": cells[2], "train": train, "opt": opt, "sram": sram})
    return rows


def _speedup_cell(sb, db, key: str) -> str:
    """Return the signed percentage of `key` cycles DB saves relative to SB, or an em dash when undefined."""
    if sb and db and sb[key] > 0:
        return f"{(sb[key] - db[key]) / sb[key] * 100:+.1f}%"
    return "—"


def _render_speedup_table(heading: str, rows) -> str:
    """Render the Markdown SB-vs-DB comparison table for one heading from its parsed rows."""
    # Join SB and DB rows by (test, l1); a later row for the same key and mode wins.
    by_key: dict = {}
    for r in rows:
        by_key.setdefault((r["test"], r["l1"]), {})[r["mode"]] = r

    lines = [
        f"\n### {heading} — SB vs DB speedup\n\n",
        "| Test | L1 (B) | SB train | DB train | train Δ | SB opt | DB opt | opt Δ |\n",
        "|------|--------|----------|----------|---------|--------|--------|-------|\n",
    ]
    for (test, l1), modes in sorted(by_key.items(), key = lambda item: item[0]):
        sb = modes.get("SB")
        db = modes.get("DB")
        sb_t = f"{sb['train']:,}" if sb else "—"
        db_t = f"{db['train']:,}" if db else "—"
        sb_o = f"{sb['opt']:,}" if sb else "—"
        db_o = f"{db['opt']:,}" if db else "—"
        delta_t = _speedup_cell(sb, db, "train")
        delta_o = _speedup_cell(sb, db, "opt")
        lines.append(f"| {test} | {l1} | {sb_t} | {db_t} | {delta_t} | {sb_o} | {db_o} | {delta_o} |\n")
    return "".join(lines)


def pytest_terminal_summary(terminalreporter, exitstatus, config):
    """Append one SB-vs-DB comparison table per training section to $GITHUB_STEP_SUMMARY.

    Does nothing when $GITHUB_STEP_SUMMARY is unset, the file does not exist,
    or it holds no parsable training rows. The hook arguments are not used.

    A heading may occur more than once in the file (the per-process header
    guard in the runner cannot deduplicate across processes). Rows from every
    occurrence of the same heading are merged, and exactly one table is
    written per distinct heading.
    """
    import re as _re

    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if not summary_path or not os.path.exists(summary_path):
        return
    try:
        # Explicit UTF-8: the summary contains non-ASCII characters ("—", "Δ").
        with open(summary_path, "r", encoding = "utf-8") as f:
            existing = f.read()
    except (OSError, UnicodeError):
        return

    # Use match positions instead of str.find so that a repeated heading is
    # resolved to its own section rather than to the first occurrence.
    rows_by_heading: dict = {}
    for match in _re.finditer(r"^## (Siracusa L[23] training cycles)$", existing, flags = _re.MULTILINE):
        section_end = existing.find("\n## ", match.end())
        body = existing[match.end():] if section_end == -1 else existing[match.end():section_end]
        rows_by_heading.setdefault(match.group(1), []).extend(_parse_training_section(body))

    tables = [_render_speedup_table(heading, rows) for heading, rows in rows_by_heading.items() if rows]
    if not tables:
        return
    try:
        with open(summary_path, "a", encoding = "utf-8") as f:
            f.write("".join(tables))
    except (OSError, UnicodeError):
        # Best-effort: a summary IO failure must not change the pytest outcome.
        pass
from testUtils.trainingUtils import _mockScheduler, add_optimizer_training_dir_arg from Deeploy.AbstractDataTypes import PointerClass @@ -102,15 +102,17 @@ def generateTiledOptimizerNetwork(args) -> None: AnnotateDefaultMemoryLevel(memoryHierarchy), ]) - # 7. Wrap with SBTiler (single-buffering; optimizer is forward-only, no lifetime extension needed). - unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer" + # 7. Wrap with tiler. SB by default; --doublebuffer switches to TrainingDBTiler. + unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer_DB{args.doublebuffer}" testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] - # TrainingSBTiler extends all input buffer lifetimes to the end of the - # schedule (via TrainingMemoryScheduler). This prevents the allocator from - # reusing the space of a consumed input (e.g. fc1 weight) for a later - # output (e.g. fc2 updated weight), which would corrupt the weight buffer. - deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir) + # TrainingSBTiler/TrainingDBTiler extend all input buffer lifetimes to the + # end of the schedule (via TrainingMemoryScheduler). This prevents the + # allocator from reusing the space of a consumed input (e.g. fc1 weight) + # for a later output (e.g. fc2 updated weight), which would corrupt the + # weight buffer. + tilerCls = TrainingDBTiler if args.doublebuffer else TrainingSBTiler + deployer = TilerDeployerWrapper(deployer, tilerCls, testName = testIdentifier, workDir = args.dumpdir) deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc deployer.tiler.memoryAllocStrategy = args.memAllocStrategy deployer.tiler.searchStrategy = args.searchStrategy @@ -159,6 +161,9 @@ def generateTiledOptimizerNetwork(args) -> None: type = str, default = "L2", help = "Default memory level for IO buffers. 
Default: L2.") + parser.add_argument("--doublebuffer", + action = "store_true", + help = "Enable double buffering for tile DMA transfers (TrainingDBTiler).") parser.add_argument("--memAllocStrategy", type = str, default = "MiniMalloc", diff --git a/DeeployTest/testMVPTraining.py b/DeeployTest/testMVPTraining.py index bf69b090..32ba10bf 100644 --- a/DeeployTest/testMVPTraining.py +++ b/DeeployTest/testMVPTraining.py @@ -13,7 +13,7 @@ from testUtils.codeGenerateTraining import generateTrainingTestNetwork from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.tilingUtils import TrainingSBTiler +from testUtils.tilingUtils import TrainingDBOnlyL3Tiler, TrainingDBTiler, TrainingSBTiler from testUtils.trainingUtils import _GRAD_ACC, _infer_data_size, _infer_n_accum, _infer_num_data_inputs, \ _infer_total_mb, _load_reference_losses, _mockScheduler, add_training_inference_args from testUtils.typeMapping import inferTypeAndOffset @@ -132,11 +132,16 @@ def generateTiledTrainingNetwork(args) -> None: AnnotateDefaultMemoryLevel(memoryHierarchy), ]) - # 9. Wrap with tiler (TrainingSBTiler: SB strategy + extended input lifetimes for backward pass). - unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}" + # 9. Wrap with tiler. SB by default; --doublebuffer switches to TrainingDBTiler + # (DB strategy + same TrainingMemoryScheduler input-lifetime extension). 
+ unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_DB{args.doublebuffer}" testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] - deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir) + if args.doublebuffer: + tilerCls = TrainingDBOnlyL3Tiler if args.defaultMemLevel == "L3" else TrainingDBTiler + else: + tilerCls = TrainingSBTiler + deployer = TilerDeployerWrapper(deployer, tilerCls, testName = testIdentifier, workDir = args.dumpdir) deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc deployer.tiler.memoryAllocStrategy = args.memAllocStrategy deployer.tiler.searchStrategy = args.searchStrategy @@ -245,6 +250,9 @@ def generateTiledTrainingNetwork(args) -> None: type = str, default = "L2", help = "Default memory level for IO buffers. Default: L2.") + parser.add_argument("--doublebuffer", + action = "store_true", + help = "Enable double buffering for tile DMA transfers (TrainingDBTiler).") parser.add_argument("--memAllocStrategy", type = str, default = "MiniMalloc", diff --git a/DeeployTest/testUtils/pytestRunner.py b/DeeployTest/testUtils/pytestRunner.py index cedc8b7a..960b3031 100644 --- a/DeeployTest/testUtils/pytestRunner.py +++ b/DeeployTest/testUtils/pytestRunner.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import os +import re from pathlib import Path from typing import List, Literal, Optional @@ -17,6 +18,16 @@ 'run_simulation', ] +# Tracks which Markdown sections we've already emitted a header for inside the +# current pytest session. Keeps run_and_assert_test idempotent across +# parametrised cases that share a section. +_METRIC_SECTIONS_WRITTEN: set = set() + +# `BENCH train_cycles= opt_cycles= weight_sram=` — printed once per +# training run by the test harness; captured here so we can append a cycles +# row to $GITHUB_STEP_SUMMARY for SB-vs-DB comparison. 
# Matches the `BENCH train_cycles=<n> opt_cycles=<n> weight_sram=<n>` line in a
# training run's stdout; the three groups are the decimal counter values.
_TRAIN_BENCH_RE = re.compile(r"BENCH train_cycles=(\d+) opt_cycles=(\d+) weight_sram=(\d+)")


def _emit_training_cycle_row(test_name: str, config: DeeployTestConfig, stdout: str, metric_section: str) -> None:
    """Append one cycle-count row for `test_name` to $GITHUB_STEP_SUMMARY.

    The row goes under `## {metric_section}`; the section heading and table
    header are written the first time a section name is seen in this process
    (tracked in _METRIC_SECTIONS_WRITTEN).

    Args:
        test_name: Value written to the "Test" column.
        config: Test configuration; only `gen_args` is read. The row is
            labelled "DB" when it contains `--doublebuffer`, otherwise "SB".
            The L1 column is taken from `--l1=VALUE` or `--l1 VALUE`.
        stdout: Captured output searched for the first BENCH line.
        metric_section: Markdown section title the row belongs to.

    No-op when $GITHUB_STEP_SUMMARY is unset or when no BENCH line was
    captured. IO failures are swallowed on purpose so they never mask the
    real test result.
    """
    summary_path = os.environ.get("GITHUB_STEP_SUMMARY")
    if not summary_path:
        return
    m = _TRAIN_BENCH_RE.search(stdout or "")
    if not m:
        return
    train_cycles, opt_cycles, weight_sram = (int(group) for group in m.groups())

    gen_args = list(config.gen_args or [])
    db_flag = "DB" if "--doublebuffer" in gen_args else "SB"
    l1 = "—"
    for idx, arg in enumerate(gen_args):
        if arg.startswith("--l1="):
            l1 = arg.split("=", 1)[1]
            break
        # Also accept the two-token form `--l1 VALUE`.
        if arg == "--l1" and idx + 1 < len(gen_args):
            l1 = gen_args[idx + 1]
            break

    try:
        # Explicit UTF-8: the placeholder "—" is non-ASCII, and the default
        # encoding of open() depends on the runner's locale.
        with open(summary_path, "a", encoding = "utf-8") as f:
            if metric_section not in _METRIC_SECTIONS_WRITTEN:
                f.write(f"\n## {metric_section}\n\n")
                f.write("| Test | L1 (B) | Mode | train_cycles | opt_cycles | weight_sram |\n")
                f.write("|------|--------|------|--------------|------------|-------------|\n")
                _METRIC_SECTIONS_WRITTEN.add(metric_section)
            f.write(f"| {test_name} | {l1} | {db_flag} | {train_cycles:,} | {opt_cycles:,} | "
                    f"{weight_sram:,} |\n")
    except (OSError, UnicodeError):
        # Best-effort: never let summary IO failure mask a real test result.
        pass
def _isScalarBuffer(ctxt: NetworkContext, tensorName: str) -> bool:
    """Report whether the buffer looked up as `tensorName` holds at most one element.

    The shape may be a bare int or a sequence of dims. Such a tensor cannot be
    split across two physical L1 slots; double-buffering it would only waste
    L1 and trip the DB pass's `_hoistMultibufferReferences` shape assertion,
    so callers treat it as single-buffered.
    """
    dims = ctxt.lookup(tensorName).shape
    numElements = dims if isinstance(dims, int) else int(np.prod(dims))
    return numElements <= 1


class TrainingDBTiler(DBTiler):
    """Double-buffering tiler for training graphs, scheduled with TrainingMemoryScheduler.

    A pattern falls back to single-buffering (coefficient 1 for every tensor)
    when it contains an operator from DB_OPT_OUT_OPS or touches a scalar buffer.
    """

    memorySchedulerClass = TrainingMemoryScheduler

    # Operators that must be opted out explicitly because the scalar-pattern
    # check in multiBufferStrategy does not cover them. The ops that used to
    # be opted out (SoftmaxCrossEntropyLoss, MSELoss, MSELossGrad, Gemm) are
    # now caught by that check, since each has a scalar tensor (loss,
    # lazy_reset_grad) somewhere in its pattern.
    DB_OPT_OUT_OPS = frozenset({
        # Outputs aliased in place to an input: DB's per-tensor multibuffer
        # hoist would split the alias across two L1 slots and break in-place
        # semantics. InPlaceAccumulatorV2 also carries the lazy_reset_grad
        # scalar, but stays listed because aliasing is the primary concern.
        "SGD",
        "InPlaceAccumulatorV2",
        # output_grad feeds two backward Gemms (multi-consumer intermediate);
        # DB's per-consumer hoist inflates _users and breaks MemoryAllocation
        # _live tracking. Tracked separately: the proper fix belongs in the
        # DB pass / _users accounting, not in this opt-out list.
        "SoftmaxCrossEntropyLossGrad",
    })

    def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
                            hop: str, tensorName: str) -> Union[int, IntVar]:
        """Return the multi-buffer coefficient for `tensorName` on `hop`.

        Returns 1 for the whole pattern when it contains an opt-out op or any
        scalar buffer; otherwise defers to the parent DB strategy.
        """
        # An opt-out op forces SB for every tensor of the pattern. The DB pass
        # then sees an offsetList of length 1 and reports applicable=False;
        # SB.apply (run before DB.apply) has already produced correct code.
        if any(node.op in self.DB_OPT_OUT_OPS for node in pattern):
            return 1

        # One scalar tensor forces coefficient 1 for the WHOLE pattern. Mixed
        # coefficients (scalar=1, non-scalar=2) make neither SB.apply (needs
        # all=1) nor DB.apply (needs all=2) applicable, so codegen degenerates
        # to a bare kernel call with no DMA setup and the kernel reads stale
        # L1 data. That was the real cause of the "autoencoder weights frozen"
        # symptom previously attributed to Gemm: MSELoss's scalar `loss`
        # output triggered this degenerate case.
        patternTensorNames = (tensor.name for node in pattern for tensor in (*node.inputs, *node.outputs))
        if any(ctxt.is_buffer(name) and _isScalarBuffer(ctxt, name) for name in patternTensorNames):
            return 1

        return super().multiBufferStrategy(tilerModel, ctxt, pattern, path, hop, tensorName)


class TrainingDBOnlyL3Tiler(TrainingDBTiler):
    """L3-mode training DB: double-buffer only the L3→L2 hop and keep L2→L1 single-buffered.

    Mirrors the inference path's `DBOnlyL3Tiler`. Plain `TrainingDBTiler`
    doubles the coefficient of every memory hop; with `defaultMemLevel=L3`
    the L2 staging buffers are doubled as well, which exceeds the 2 MB L2
    capacity on ResNet8/MobileNetV1 training graphs (the constraint solver
    returns infeasible).
    """

    def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str],
                            hop: str, tensorName: str) -> Union[int, IntVar]:
        """Return 1 for the "L1" hop; otherwise defer to `TrainingDBTiler`."""
        if hop != "L1":
            return super().multiBufferStrategy(tilerModel, ctxt, pattern, path, hop, tensorName)
        return 1
@pytest.mark.siracusa_tiled
@pytest.mark.training
@pytest.mark.doublebuffer
@pytest.mark.l3
@pytest.mark.parametrize(
    "test_params",
    generate_test_params(SIRACUSA_L3_DOUBLEBUFFER_TRAINING_MODELS, "L3-doublebuffer-training"),
    ids = param_id,
)
def test_siracusa_tiled_training_l3_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args,
                                                 skipgen, skipsim) -> None:
    """Run one double-buffered tiled training model on Siracusa with default memory level L3.

    The result is reported under the "Siracusa L3 training cycles" summary section.
    """
    test_name, l1, _ = test_params
    model_overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
    config_kwargs = {
        "test_name": test_name,
        "platform": "Siracusa",
        "simulator": "gvsoc",
        "deeploy_test_dir": deeploy_test_dir,
        "toolchain": toolchain,
        "toolchain_dir": toolchain_dir,
        "cmake_args": cmake_args,
        "tiling": True,
        "cores": SIRACUSA_DEFAULT_CORES,
        "l1": l1,
        "l2": 2000000,
        "default_mem_level": "L3",
        "double_buffer": True,
        "training": True,
        "training_num_data_inputs": model_overrides.get("num_data_inputs"),
        "training_tolerance": model_overrides.get("tolerance"),
    }
    run_and_assert_test(
        test_name,
        create_test_config(**config_kwargs),
        skipgen,
        skipsim,
        metric_section = "Siracusa L3 training cycles",
    )


@pytest.mark.siracusa_tiled
@pytest.mark.training
@pytest.mark.doublebuffer
@pytest.mark.l2
@pytest.mark.parametrize(
    "test_params",
    generate_test_params(SIRACUSA_L2_DOUBLEBUFFER_TRAINING_MODELS, "L2-doublebuffer-training"),
    ids = param_id,
)
def test_siracusa_tiled_training_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args,
                                                 skipgen, skipsim) -> None:
    """Run one double-buffered tiled training model on Siracusa with default memory level L2.

    The result is reported under the "Siracusa L2 training cycles" summary section.
    """
    test_name, l1, _ = test_params
    model_overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {})
    config_kwargs = {
        "test_name": test_name,
        "platform": "Siracusa",
        "simulator": "gvsoc",
        "deeploy_test_dir": deeploy_test_dir,
        "toolchain": toolchain,
        "toolchain_dir": toolchain_dir,
        "cmake_args": cmake_args,
        "tiling": True,
        "cores": SIRACUSA_DEFAULT_CORES,
        "l1": l1,
        "l2": 2000000,
        "default_mem_level": "L2",
        "double_buffer": True,
        "training": True,
        "training_num_data_inputs": model_overrides.get("num_data_inputs"),
        "training_tolerance": model_overrides.get("tolerance"),
    }
    run_and_assert_test(
        test_name,
        create_test_config(**config_kwargs),
        skipgen,
        skipsim,
        metric_section = "Siracusa L2 training cycles",
    )
# Double-buffered (DB) L2 training matrix. NOTE: this list already covers
# SimpleMLP, Autoencoder and DSCNN, not SimpleMLP alone. Values are the L1
# sizes (bytes) each model is run with.
# L2 DB at L1=128 KB → almost all ops are 1-tile (tensors fit comfortably);
# the DB pass triggers but has nothing to pipeline. The 32 KB autoencoder
# variant is included so that ~8 of 55 ops become 2-4 tiles and DB pipelining
# actually fires. DSCNN is structurally DB-unfriendly at L2 (depthwise/pointwise
# Conv weights are tiny, only ~1 of 97 ops multi-tiles even at L1=16 KB), so
# it is listed at 128 KB only.
L2_DOUBLEBUFFER_TRAINING_MODELS = {
    "Models/Training/SimpleMLP/simplemlp_train": [64000],
    "Models/Training/Autoencoder/autoencoder_train": [128000, 32000],
    "Models/Training/DSCNN/dscnn_train": [128000],
}

# L3 DB training: only the L3↔L2 hop is double-buffered (TrainingDBOnlyL3Tiler)
# so the L2 staging budget doesn't double. CCT/CCT_LoRA are left out — their
# backward alias graph still trips MemoryAllocation _live tracking even with
# the opt-out list (a separate follow-up).
L3_DOUBLEBUFFER_TRAINING_MODELS = {
    "Models/Training/ResNet8/resnet8_train": [128000],
    "Models/Training/MobileNetV1/mobilenetv1_train": [128000],
}

# Per-model overrides for training tests.
#
# - num_data_inputs: required when inputs.npz has only one mini-batch (no