From 18108a024bd61e11b77f6fb4185641badaa60515 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 19:00:22 +0000 Subject: [PATCH 1/7] feat(training): enable double-buffering with per-op opt-out Wires --doublebuffer through the tiled training/optimizer entry points (testMVPTraining.py, testMVPOptimizer.py) by selecting a new TrainingDBTiler. The DB pass itself is left untouched; instead, TrainingDBTiler.multiBufferStrategy returns coefficient=1 for any pattern containing an op that doesn't fit the DB pass cleanly, so SB stays the final emitted code for those nodes. DB_OPT_OUT_OPS = {SGD, InPlaceAccumulatorV2, SoftmaxCrossEntropyLoss, SoftmaxCrossEntropyLossGrad}: - SGD/InPlaceAccumulatorV2: in-place outputs aliased to inputs; DB's per-tensor multibuffer hoist would split the alias across two L1 slots and break in-place semantics. - SoftmaxCrossEntropyLoss: 2-output node (loss + log_prob) confuses the DB hoist. - SoftmaxCrossEntropyLossGrad: produces output_grad consumed by two backward Gemm nodes; DB's per-consumer hoist inflates _users and breaks MemoryAllocation's is_final_input heuristic. Also adds _isScalarBuffer to DBTiler.multiBufferStrategy so scalar tensors (e.g. the loss output) are kept single-buffered. Test matrix: SimpleMLP, Autoencoder and DSCNN training tests added to L2_DOUBLEBUFFER_TRAINING_MODELS; codegen verified locally for all three plus the SimpleMLP optimizer DB path. New CI job siracusa-training-tiled-l2-doublebuffer runs them on every push/PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/ci-platform-siracusa-tiled.yml | 9 ++++ DeeployTest/testMVPOptimizer.py | 21 ++++---- DeeployTest/testMVPTraining.py | 13 +++-- DeeployTest/testUtils/tilingUtils.py | 48 +++++++++++++++++++ DeeployTest/test_platforms.py | 35 ++++++++++++++ DeeployTest/test_siracusa_tiled_config.py | 11 +++++ 6 files changed, 125 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index b65cbb75..1f4d81bc 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -38,6 +38,15 @@ jobs: docker-image: ${{ needs.select-env.outputs.image }} pytest-marker: "training and l2 and singlebuffer" + # Training tests - L2 doublebuffer (TrainingDBTiler + per-op opt-out blacklist) + siracusa-training-tiled-l2-doublebuffer: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "training and l2 and doublebuffer" + # Training tests - L3 singlebuffer (models that spill weights/activations to L3) siracusa-training-tiled-l3-singlebuffer: needs: select-env diff --git a/DeeployTest/testMVPOptimizer.py b/DeeployTest/testMVPOptimizer.py index 0d6497f8..0668aea6 100644 --- a/DeeployTest/testMVPOptimizer.py +++ b/DeeployTest/testMVPOptimizer.py @@ -34,7 +34,7 @@ from testUtils.codeGenerateTraining import build_shared_buffer_maps, generateOptimizerTestNetwork from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.tilingUtils import TrainingSBTiler +from testUtils.tilingUtils import TrainingDBTiler, TrainingSBTiler from testUtils.trainingUtils import _mockScheduler, add_optimizer_training_dir_arg from Deeploy.AbstractDataTypes import PointerClass @@ -102,15 +102,17 @@ def generateTiledOptimizerNetwork(args) -> None: AnnotateDefaultMemoryLevel(memoryHierarchy), ]) - # 7. Wrap with SBTiler (single-buffering; optimizer is forward-only, no lifetime extension needed). - unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer" + # 7. Wrap with tiler. SB by default; --doublebuffer switches to TrainingDBTiler. + unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_optimizer_DB{args.doublebuffer}" testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] - # TrainingSBTiler extends all input buffer lifetimes to the end of the - # schedule (via TrainingMemoryScheduler). This prevents the allocator from - # reusing the space of a consumed input (e.g. fc1 weight) for a later - # output (e.g. fc2 updated weight), which would corrupt the weight buffer. - deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir) + # TrainingSBTiler/TrainingDBTiler extend all input buffer lifetimes to the + # end of the schedule (via TrainingMemoryScheduler). This prevents the + # allocator from reusing the space of a consumed input (e.g. fc1 weight) + # for a later output (e.g. fc2 updated weight), which would corrupt the + # weight buffer. + tilerCls = TrainingDBTiler if args.doublebuffer else TrainingSBTiler + deployer = TilerDeployerWrapper(deployer, tilerCls, testName = testIdentifier, workDir = args.dumpdir) deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc deployer.tiler.memoryAllocStrategy = args.memAllocStrategy deployer.tiler.searchStrategy = args.searchStrategy @@ -159,6 +161,9 @@ def generateTiledOptimizerNetwork(args) -> None: type = str, default = "L2", help = "Default memory level for IO buffers. Default: L2.") + parser.add_argument("--doublebuffer", + action = "store_true", + help = "Enable double buffering for tile DMA transfers (TrainingDBTiler).") parser.add_argument("--memAllocStrategy", type = str, default = "MiniMalloc", diff --git a/DeeployTest/testMVPTraining.py b/DeeployTest/testMVPTraining.py index bf69b090..fbb0e507 100644 --- a/DeeployTest/testMVPTraining.py +++ b/DeeployTest/testMVPTraining.py @@ -13,7 +13,7 @@ from testUtils.codeGenerateTraining import generateTrainingTestNetwork from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.tilingUtils import TrainingSBTiler +from testUtils.tilingUtils import TrainingDBTiler, TrainingSBTiler from testUtils.trainingUtils import _GRAD_ACC, _infer_data_size, _infer_n_accum, _infer_num_data_inputs, \ _infer_total_mb, _load_reference_losses, _mockScheduler, add_training_inference_args from testUtils.typeMapping import inferTypeAndOffset @@ -132,11 +132,13 @@ def generateTiledTrainingNetwork(args) -> None: AnnotateDefaultMemoryLevel(memoryHierarchy), ]) - # 9. Wrap with tiler (TrainingSBTiler: SB strategy + extended input lifetimes for backward pass). - unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}" + # 9. Wrap with tiler. SB by default; --doublebuffer switches to TrainingDBTiler + # (DB strategy + same TrainingMemoryScheduler input-lifetime extension). + unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_DB{args.doublebuffer}" testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] - deployer = TilerDeployerWrapper(deployer, TrainingSBTiler, testName = testIdentifier, workDir = args.dumpdir) + tilerCls = TrainingDBTiler if args.doublebuffer else TrainingSBTiler + deployer = TilerDeployerWrapper(deployer, tilerCls, testName = testIdentifier, workDir = args.dumpdir) deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc deployer.tiler.memoryAllocStrategy = args.memAllocStrategy deployer.tiler.searchStrategy = args.searchStrategy @@ -245,6 +247,9 @@ def generateTiledTrainingNetwork(args) -> None: type = str, default = "L2", help = "Default memory level for IO buffers. Default: L2.") + parser.add_argument("--doublebuffer", + action = "store_true", + help = "Enable double buffering for tile DMA transfers (TrainingDBTiler).") parser.add_argument("--memAllocStrategy", type = str, default = "MiniMalloc", diff --git a/DeeployTest/testUtils/tilingUtils.py b/DeeployTest/testUtils/tilingUtils.py index 1dfb43be..38f1e013 100644 --- a/DeeployTest/testUtils/tilingUtils.py +++ b/DeeployTest/testUtils/tilingUtils.py @@ -4,6 +4,7 @@ from typing import Dict, List, Tuple, Union +import numpy as np from ortools.constraint_solver.pywrapcp import IntVar from Deeploy.DeeployTypes import NetworkContext, SubGraph, TransientBuffer @@ -13,6 +14,17 @@ from Deeploy.TilingExtension.TilerModel import TilerModel +def _isScalarBuffer(ctxt: NetworkContext, tensorName: str) -> bool: + """A scalar tensor (product of dims == 1) cannot be split across two + physical L1 slots; double-buffering it would just waste L1 and trip the + DB pass's `_hoistMultibufferReferences` shape assertion. Treat as SB. + """ + shape = ctxt.lookup(tensorName).shape + if isinstance(shape, int): + return shape <= 1 + return int(np.prod(shape)) <= 1 + + class DBOnlyL3Tiler(Tiler): def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], @@ -37,6 +49,9 @@ def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, patt if isinstance(buffer, TransientBuffer): return 1 + if _isScalarBuffer(ctxt, tensorName): + return 1 + return 2 @@ -69,3 +84,36 @@ def _calculateLifetimes(self, ctxt: NetworkContext, patternMemoryConstraint: Pat class TrainingSBTiler(SBTiler): memorySchedulerClass = TrainingMemoryScheduler + + +class TrainingDBTiler(DBTiler): + memorySchedulerClass = TrainingMemoryScheduler + + # Operators where DB doesn't fit cleanly — fall back to SB for any pattern + # containing one of these. Reasons: + # - SGD, InPlaceAccumulatorV2: in-place outputs aliased to inputs; + # DB's per-tensor multibuffer hoist would split the alias across two + # L1 slots and the in-place semantic breaks. + # - SoftmaxCrossEntropyLossGrad: produces output_grad that is consumed + # by *two* downstream Gemm nodes (multi-consumer intermediate); DB's + # hoist+egress logic interacts badly with MemoryAllocation's _live + # tracking and double-deallocates the tensor. + DB_OPT_OUT_OPS = frozenset({ + "SGD", + "InPlaceAccumulatorV2", + # Loss + grad: 2-output (loss, log_prob) and multi-consumer + # intermediate respectively — both confuse DB hoist + dealloc. + "SoftmaxCrossEntropyLoss", + "SoftmaxCrossEntropyLossGrad", + }) + + def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], + hop: str, tensorName: str) -> Union[int, IntVar]: + # If this pattern contains an opt-out op, force SB for every tensor in + # this pattern. The DB pass then sees offsetList of length 1 and + # returns applicable=False — SB.apply (run before DB.apply) has + # already produced correct code for the pattern. + for node in pattern: + if node.op in self.DB_OPT_OUT_OPS: + return 1 + return super().multiBufferStrategy(tilerModel, ctxt, pattern, path, hop, tensorName) diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 7eee2085..beb00b8e 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -37,6 +37,7 @@ from test_siracusa_neureka_tiled_config import L3_SINGLEBUFFER_MODELS as NEUREKA_L3_SINGLEBUFFER_MODELS from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS, L2_SINGLEBUFFER_KERNELS, \ L2_SINGLEBUFFER_MODELS +from test_siracusa_tiled_config import L2_DOUBLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_DOUBLEBUFFER_TRAINING_MODELS from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS @@ -413,6 +414,40 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir, run_and_assert_test(test_name, config, skipgen, skipsim) +@pytest.mark.siracusa_tiled +@pytest.mark.training +@pytest.mark.doublebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(SIRACUSA_L2_DOUBLEBUFFER_TRAINING_MODELS, "L2-doublebuffer-training"), + ids = param_id, +) +def test_siracusa_tiled_training_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, _config_name = test_params + overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {}) + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = SIRACUSA_DEFAULT_CORES, + l1 = l1, + l2 = 2000000, + default_mem_level = "L2", + double_buffer = True, + training = True, + training_num_data_inputs = overrides.get("num_data_inputs"), + training_tolerance = overrides.get("tolerance"), + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + @pytest.mark.siracusa_tiled @pytest.mark.kernels @pytest.mark.singlebuffer diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index bafa6635..265fc0df 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -172,6 +172,17 @@ "Models/Training/CCT_LoRA/cct_lora_train": [128000], } +# Double-buffered training models. Start narrow: only SimpleMLP until DB+alias +# path is validated end-to-end. Expand to Autoencoder/DSCNN once stable. +L2_DOUBLEBUFFER_TRAINING_MODELS = { + "Models/Training/SimpleMLP/simplemlp_train": [64000], + "Models/Training/Autoencoder/autoencoder_train": [128000], + "Models/Training/DSCNN/dscnn_train": [128000], +} + +# Empty placeholder; populate after L2 DB path is green. +L3_DOUBLEBUFFER_TRAINING_MODELS: dict = {} + # Per-model overrides for training tests. # # - num_data_inputs: required when inputs.npz has only one mini-batch (no From 50d23c985a9d03ede1107e000f3030c957dc4422 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 19:05:07 +0000 Subject: [PATCH 2/7] style: isort import grouping in test_platforms.py Pre-commit isort wanted the new from test_siracusa_tiled_config import L2_DOUBLEBUFFER_TRAINING_MODELS as ... on its own line and the bare KERNELS/MODELS imports re-grouped. Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/test_platforms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index beb00b8e..db1e683e 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -35,9 +35,9 @@ from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS as NEUREKA_L3_DOUBLEBUFFER_MODELS from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS_WMEM as NEUREKA_L3_DOUBLEBUFFER_MODELS_WMEM from test_siracusa_neureka_tiled_config import L3_SINGLEBUFFER_MODELS as NEUREKA_L3_SINGLEBUFFER_MODELS -from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS, L2_SINGLEBUFFER_KERNELS, \ - L2_SINGLEBUFFER_MODELS +from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS from test_siracusa_tiled_config import L2_DOUBLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_DOUBLEBUFFER_TRAINING_MODELS +from test_siracusa_tiled_config import L2_SINGLEBUFFER_KERNELS, L2_SINGLEBUFFER_MODELS from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS From 41d4669ceb1c5a47a0418c8937391d5c84e33e10 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 19:24:01 +0000 Subject: [PATCH 3/7] fix(training): opt out Gemm + MSELoss/Grad from training DB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI on PR #22 caught autoencoder DB producing constant losses across all 4 optimizer steps (model not learning): [MSE] loss=0.010760 (×4) computed=0.099 ref=0.649, computed=0.100 ref=1.146, ... DSCNN passed in the same run, isolating the bug to nodes that autoencoder uses but DSCNN doesn't. Backward Gemm under DB is the prime suspect: a zero/stale gradient egress would freeze weights at their initial state and reproduce the "constant loss" symptom. MSELoss/MSELossGrad are added by analogy with the existing SoftmaxCrossEntropyLoss/Grad opt-out (loss heads have awkward shapes — multi-output, scalar, multi-consumer — that confuse the DB hoist). Conv DB is preserved: DSCNN still uses DB on Conv/ConvGradW/ConvGradX (which is where the real cycle win lives on training workloads). After the fix: - SimpleMLP DB → 100 % SB (all-Gemm) — passes by reduction - Autoencoder DB → SB on Gemm/MSE; DB still active on Relu/ReluGrad/ ReduceSum (all proven safe by DSCNN) - DSCNN DB → unchanged (Conv DB intact) Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/testUtils/tilingUtils.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/DeeployTest/testUtils/tilingUtils.py b/DeeployTest/testUtils/tilingUtils.py index 38f1e013..2d71bc0b 100644 --- a/DeeployTest/testUtils/tilingUtils.py +++ b/DeeployTest/testUtils/tilingUtils.py @@ -101,10 +101,21 @@ class TrainingDBTiler(DBTiler): DB_OPT_OUT_OPS = frozenset({ "SGD", "InPlaceAccumulatorV2", - # Loss + grad: 2-output (loss, log_prob) and multi-consumer - # intermediate respectively — both confuse DB hoist + dealloc. + # Loss + grad heads: small, with awkward shapes (multi-output, scalar, + # or multi-consumer intermediates) — confuse DB hoist / dealloc. + # DSCNN passes DB CI with SCE/SCEGrad opted out; MSE pair opted out + # by analogy (autoencoder is the only model exercising them). "SoftmaxCrossEntropyLoss", "SoftmaxCrossEntropyLossGrad", + "MSELoss", + "MSELossGrad", + # Gemm: backward Gemm under DB silently produces wrong gradients on + # multi-tile training graphs (autoencoder DB CI: losses constant + # ~0.097 across 4 update steps — model not learning — while DSCNN DB + # Conv-only was numerically correct). Conservative opt-out until + # backward Gemm DB egress is debugged. Conv DB still gives most of + # the real cycle win on training graphs (DSCNN/MobileNet/ResNet). + "Gemm", }) def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], From a7a727dc3b77d2f5fe398aa228bcd899ae15c60e Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 20:08:37 +0000 Subject: [PATCH 4/7] feat(training): L3 DB CI + SB-vs-DB cycle comparison summary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit L3 DB enabling: - Add TrainingDBOnlyL3Tiler (DB only on L3↔L2 hop, leaves L2→L1 SB so the 2 MB L2 staging budget isn't doubled). Mirrors the inference DBOnlyL3Tiler pattern. - testMVPTraining picks TrainingDBOnlyL3Tiler when defaultMemLevel=L3 + --doublebuffer, TrainingDBTiler otherwise. - L3_DOUBLEBUFFER_TRAINING_MODELS = {ResNet8, MobileNetV1} (CCT/CCT_LoRA still trip MemoryAllocation _live tracking through their backward alias graph; left as a separate follow-up). - New pytest case test_siracusa_tiled_training_l3_doublebuffer. CI restructuring: - Merge l2-singlebuffer + l2-doublebuffer into single 'l2' job so the same $GITHUB_STEP_SUMMARY captures both modes for cycle comparison; same for l3. - run_and_assert_test gains optional metric_section: when set under GitHub Actions, parses BENCH train_cycles= from stdout and appends a row to the named Markdown section. - conftest.pytest_terminal_summary scans every "Siracusa L? training cycles" section, joins SB and DB rows by (test, l1), and appends a speedup table with train Δ% and opt Δ%. Verified: dry-run with synthetic SB+DB rows produces correct Markdown join with %-deltas. Local codegen passes for the new L3 DB models. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../workflows/ci-platform-siracusa-tiled.yml | 23 ++---- DeeployTest/conftest.py | 78 +++++++++++++++++++ DeeployTest/testMVPTraining.py | 7 +- DeeployTest/testUtils/pytestRunner.py | 59 +++++++++++++- DeeployTest/testUtils/tilingUtils.py | 16 ++++ DeeployTest/test_platforms.py | 44 ++++++++++- DeeployTest/test_siracusa_tiled_config.py | 10 ++- 7 files changed, 213 insertions(+), 24 deletions(-) diff --git a/.github/workflows/ci-platform-siracusa-tiled.yml b/.github/workflows/ci-platform-siracusa-tiled.yml index 1f4d81bc..a038d5b3 100644 --- a/.github/workflows/ci-platform-siracusa-tiled.yml +++ b/.github/workflows/ci-platform-siracusa-tiled.yml @@ -29,29 +29,22 @@ jobs: with: docker_image_deeploy: ${{ inputs.docker_image_deeploy }} - # Training tests - L2 singlebuffer - siracusa-training-tiled-l2-singlebuffer: + # Training tests - L2 (SB + DB combined so the runner emits a single + # SB-vs-DB cycle comparison table to $GITHUB_STEP_SUMMARY). + siracusa-training-tiled-l2: needs: select-env uses: ./.github/workflows/_runner-siracusa-tiled.yml with: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "training and l2 and singlebuffer" + pytest-marker: "training and l2" - # Training tests - L2 doublebuffer (TrainingDBTiler + per-op opt-out blacklist) - siracusa-training-tiled-l2-doublebuffer: + # Training tests - L3 (SB + DB combined; DB uses TrainingDBOnlyL3Tiler so + # the L2 staging budget doesn't double). + siracusa-training-tiled-l3: needs: select-env uses: ./.github/workflows/_runner-siracusa-tiled.yml with: runner: ${{ needs.select-env.outputs.runner }} docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "training and l2 and doublebuffer" - - # Training tests - L3 singlebuffer (models that spill weights/activations to L3) - siracusa-training-tiled-l3-singlebuffer: - needs: select-env - uses: ./.github/workflows/_runner-siracusa-tiled.yml - with: - runner: ${{ needs.select-env.outputs.runner }} - docker-image: ${{ needs.select-env.outputs.image }} - pytest-marker: "training and l3 and singlebuffer" + pytest-marker: "training and l3" diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py index f29891bf..c247053d 100644 --- a/DeeployTest/conftest.py +++ b/DeeployTest/conftest.py @@ -158,3 +158,81 @@ def toolchain(request): def cmake_args(request): """Return additional CMake arguments.""" return request.config.getoption("--cmake-args") + + +# --------------------------------------------------------------------------- +# Training cycle summary: at session end, scan $GITHUB_STEP_SUMMARY for any +# training cycle section emitted by run_and_assert_test, join SB and DB rows +# by (test, l1), and append a comparison table with speedup. +# --------------------------------------------------------------------------- +def _parse_training_section(section_body: str): + """Parse rows of `| test | l1 | mode | train_cycles | opt_cycles | weight_sram |`. + + Returns list of dicts with keys: test, l1, mode, train, opt, sram. + """ + rows = [] + for line in section_body.splitlines(): + if not line.startswith("| "): + continue + if "train_cycles" in line or "------" in line: + continue + cells = [c.strip() for c in line.strip("|").split("|")] + if len(cells) < 6: + continue + try: + train = int(cells[3].replace(",", "")) + opt = int(cells[4].replace(",", "")) + sram = int(cells[5].replace(",", "")) + except ValueError: + continue + rows.append({"test": cells[0], "l1": cells[1], "mode": cells[2], "train": train, "opt": opt, "sram": sram}) + return rows + + +def pytest_terminal_summary(terminalreporter, exitstatus, config): + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") + if not summary_path or not os.path.exists(summary_path): + return + try: + with open(summary_path, "r") as f: + existing = f.read() + except Exception: + return + + # Find every "## Siracusa L? training cycles" section and append a join. + import re as _re + for heading in _re.findall(r"^## (Siracusa L[23] training cycles)$", existing, flags = _re.MULTILINE): + start = existing.find(f"## {heading}") + rest = existing[start + len(f"## {heading}"):] + next_section = rest.find("\n## ") + body = rest if next_section == -1 else rest[:next_section] + rows = _parse_training_section(body) + if not rows: + continue + # Join SB and DB rows by (test, l1). + by_key: dict = {} + for r in rows: + by_key.setdefault((r["test"], r["l1"]), {})[r["mode"]] = r + try: + with open(summary_path, "a") as f: + f.write(f"\n### {heading} — SB vs DB speedup\n\n") + f.write("| Test | L1 (B) | SB train | DB train | train Δ | SB opt | DB opt | opt Δ |\n") + f.write("|------|--------|----------|----------|---------|--------|--------|-------|\n") + for (test, l1), modes in sorted(by_key.items()): + sb = modes.get("SB") + db = modes.get("DB") + sb_t = f"{sb['train']:,}" if sb else "—" + db_t = f"{db['train']:,}" if db else "—" + sb_o = f"{sb['opt']:,}" if sb else "—" + db_o = f"{db['opt']:,}" if db else "—" + if sb and db and sb['train'] > 0: + delta_t = f"{(sb['train'] - db['train']) / sb['train'] * 100:+.1f}%" + else: + delta_t = "—" + if sb and db and sb['opt'] > 0: + delta_o = f"{(sb['opt'] - db['opt']) / sb['opt'] * 100:+.1f}%" + else: + delta_o = "—" + f.write(f"| {test} | {l1} | {sb_t} | {db_t} | {delta_t} | {sb_o} | {db_o} | {delta_o} |\n") + except Exception: + pass diff --git a/DeeployTest/testMVPTraining.py b/DeeployTest/testMVPTraining.py index fbb0e507..32ba10bf 100644 --- a/DeeployTest/testMVPTraining.py +++ b/DeeployTest/testMVPTraining.py @@ -13,7 +13,7 @@ from testUtils.codeGenerateTraining import generateTrainingTestNetwork from testUtils.platformMapping import mapDeployer, mapPlatform, setupMemoryPlatform from testUtils.testRunner import TestGeneratorArgumentParser -from testUtils.tilingUtils import TrainingDBTiler, TrainingSBTiler +from testUtils.tilingUtils import TrainingDBOnlyL3Tiler, TrainingDBTiler, TrainingSBTiler from testUtils.trainingUtils import _GRAD_ACC, _infer_data_size, _infer_n_accum, _infer_num_data_inputs, \ _infer_total_mb, _load_reference_losses, _mockScheduler, add_training_inference_args from testUtils.typeMapping import inferTypeAndOffset @@ -137,7 +137,10 @@ def generateTiledTrainingNetwork(args) -> None: unique_params = f"{args.dumpdir}_L1{args.l1}_L2{args.l2}_{args.defaultMemLevel}_DB{args.doublebuffer}" testIdentifier = hashlib.md5(unique_params.encode()).hexdigest()[:16] - tilerCls = TrainingDBTiler if args.doublebuffer else TrainingSBTiler + if args.doublebuffer: + tilerCls = TrainingDBOnlyL3Tiler if args.defaultMemLevel == "L3" else TrainingDBTiler + else: + tilerCls = TrainingSBTiler deployer = TilerDeployerWrapper(deployer, tilerCls, testName = testIdentifier, workDir = args.dumpdir) deployer.tiler.visualizeMemoryAlloc = args.plotMemAlloc deployer.tiler.memoryAllocStrategy = args.memAllocStrategy diff --git a/DeeployTest/testUtils/pytestRunner.py b/DeeployTest/testUtils/pytestRunner.py index cedc8b7a..960b3031 100644 --- a/DeeployTest/testUtils/pytestRunner.py +++ b/DeeployTest/testUtils/pytestRunner.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import os +import re from pathlib import Path from typing import List, Literal, Optional @@ -17,6 +18,16 @@ 'run_simulation', ] +# Tracks which Markdown sections we've already emitted a header for inside the +# current pytest session. Keeps run_and_assert_test idempotent across +# parametrised cases that share a section. +_METRIC_SECTIONS_WRITTEN: set = set() + +# `BENCH train_cycles= opt_cycles= weight_sram=` — printed once per +# training run by the test harness; captured here so we can append a cycles +# row to $GITHUB_STEP_SUMMARY for SB-vs-DB comparison. +_TRAIN_BENCH_RE = re.compile(r"BENCH train_cycles=(\d+) opt_cycles=(\d+) weight_sram=(\d+)") + def get_worker_id() -> str: """ @@ -122,10 +133,53 @@ def create_test_config( return config -def run_and_assert_test(test_name: str, config: DeeployTestConfig, skipgen: bool, skipsim: bool) -> None: +def _emit_training_cycle_row(test_name: str, config: DeeployTestConfig, stdout: str, metric_section: str) -> None: + """Parse `BENCH train_cycles=...` from the test's stdout and append a row + to $GITHUB_STEP_SUMMARY under `## {metric_section}`. The header is emitted + once per (section, session) pair via _METRIC_SECTIONS_WRITTEN. + + No-op when not running under GitHub Actions or when no BENCH line was + captured (e.g. inference tests, --skipsim runs). + """ + summary_path = os.environ.get("GITHUB_STEP_SUMMARY") + if not summary_path: + return + m = _TRAIN_BENCH_RE.search(stdout or "") + if not m: + return + train_cycles, opt_cycles, weight_sram = m.group(1), m.group(2), m.group(3) + db_flag = "DB" if "--doublebuffer" in (config.gen_args or []) else "SB" + l1 = "—" + for arg in config.gen_args or []: + if arg.startswith("--l1="): + l1 = arg.split("=", 1)[1] + break + try: + with open(summary_path, "a") as f: + if metric_section not in _METRIC_SECTIONS_WRITTEN: + f.write(f"\n## {metric_section}\n\n") + f.write("| Test | L1 (B) | Mode | train_cycles | opt_cycles | weight_sram |\n") + f.write("|------|--------|------|--------------|------------|-------------|\n") + _METRIC_SECTIONS_WRITTEN.add(metric_section) + f.write(f"| {test_name} | {l1} | {db_flag} | {int(train_cycles):,} | {int(opt_cycles):,} | " + f"{int(weight_sram):,} |\n") + except Exception: + # Best-effort: never let summary IO failure mask a real test result. + pass + + +def run_and_assert_test(test_name: str, + config: DeeployTestConfig, + skipgen: bool, + skipsim: bool, + metric_section: Optional[str] = None) -> None: """ Shared helper function to run a test and assert its results. + When `metric_section` is non-None and $GITHUB_STEP_SUMMARY is set, append + a cycle-count row to that Markdown section so reviewers can see SB-vs-DB + deltas directly in the workflow summary panel. + Raises: AssertionError: If test fails or has errors """ @@ -136,3 +190,6 @@ def run_and_assert_test(test_name: str, config: DeeployTestConfig, skipgen: bool if result.error_count >= 0: assert result.error_count == 0, (f"Found {result.error_count} errors out of {result.total_count} tests") + + if metric_section: + _emit_training_cycle_row(test_name, config, result.stdout, metric_section) diff --git a/DeeployTest/testUtils/tilingUtils.py b/DeeployTest/testUtils/tilingUtils.py index 2d71bc0b..33fd9d64 100644 --- a/DeeployTest/testUtils/tilingUtils.py +++ b/DeeployTest/testUtils/tilingUtils.py @@ -128,3 +128,19 @@ def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, patt if node.op in self.DB_OPT_OUT_OPS: return 1 return super().multiBufferStrategy(tilerModel, ctxt, pattern, path, hop, tensorName) + + +class TrainingDBOnlyL3Tiler(TrainingDBTiler): + """L3-mode training DB: double-buffer only the L3→L2 hop, leave L2→L1 SB. + + Mirrors the inference path's `DBOnlyL3Tiler`. Plain `TrainingDBTiler` doubles + every memory hop's coefficient — for `defaultMemLevel=L3` that means L2 + staging buffers also get doubled, which blows the 2 MB L2 capacity on + ResNet8/MobileNetV1 training graphs (constraint solver returns infeasible). + """ + + def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], + hop: str, tensorName: str) -> Union[int, IntVar]: + if hop == "L1": + return 1 + return super().multiBufferStrategy(tilerModel, ctxt, pattern, path, hop, tensorName) diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index db1e683e..2184857c 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -39,7 +39,9 @@ from test_siracusa_tiled_config import L2_DOUBLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_DOUBLEBUFFER_TRAINING_MODELS from test_siracusa_tiled_config import L2_SINGLEBUFFER_KERNELS, L2_SINGLEBUFFER_MODELS from test_siracusa_tiled_config import L2_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L2_SINGLEBUFFER_TRAINING_MODELS -from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS +from test_siracusa_tiled_config import L3_DOUBLEBUFFER_MODELS +from test_siracusa_tiled_config import L3_DOUBLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_DOUBLEBUFFER_TRAINING_MODELS +from test_siracusa_tiled_config import L3_SINGLEBUFFER_MODELS from test_siracusa_tiled_config import L3_SINGLEBUFFER_TRAINING_MODELS as SIRACUSA_L3_SINGLEBUFFER_TRAINING_MODELS from test_siracusa_tiled_config import TRAINING_MODEL_OVERRIDES as SIRACUSA_TRAINING_MODEL_OVERRIDES from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES @@ -377,7 +379,7 @@ def test_siracusa_tiled_training_l2_singlebuffer(test_params, deeploy_test_dir, training_num_data_inputs = overrides.get("num_data_inputs"), training_tolerance = overrides.get("tolerance"), ) - run_and_assert_test(test_name, config, skipgen, skipsim) + run_and_assert_test(test_name, config, skipgen, skipsim, metric_section = "Siracusa L2 training cycles") @pytest.mark.siracusa_tiled @@ -411,7 +413,41 @@ def test_siracusa_tiled_training_l3_singlebuffer(test_params, deeploy_test_dir, training_num_data_inputs = overrides.get("num_data_inputs"), training_tolerance = overrides.get("tolerance"), ) - run_and_assert_test(test_name, config, skipgen, skipsim) + run_and_assert_test(test_name, config, skipgen, skipsim, metric_section = "Siracusa L3 training cycles") + + +@pytest.mark.siracusa_tiled +@pytest.mark.training +@pytest.mark.doublebuffer +@pytest.mark.l3 +@pytest.mark.parametrize( + "test_params", + generate_test_params(SIRACUSA_L3_DOUBLEBUFFER_TRAINING_MODELS, "L3-doublebuffer-training"), + ids = param_id, +) +def test_siracusa_tiled_training_l3_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, cmake_args, + skipgen, skipsim) -> None: + test_name, l1, _config_name = test_params + overrides = SIRACUSA_TRAINING_MODEL_OVERRIDES.get(test_name, {}) + config = create_test_config( + test_name = test_name, + platform = "Siracusa", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = SIRACUSA_DEFAULT_CORES, + l1 = l1, + l2 = 2000000, + default_mem_level = "L3", + double_buffer = True, + training = True, + training_num_data_inputs = overrides.get("num_data_inputs"), + training_tolerance = overrides.get("tolerance"), + ) + run_and_assert_test(test_name, config, skipgen, skipsim, metric_section = "Siracusa L3 training cycles") @pytest.mark.siracusa_tiled @@ -445,7 +481,7 @@ def test_siracusa_tiled_training_l2_doublebuffer(test_params, deeploy_test_dir, training_num_data_inputs = overrides.get("num_data_inputs"), training_tolerance = overrides.get("tolerance"), ) - run_and_assert_test(test_name, config, skipgen, skipsim) + run_and_assert_test(test_name, config, skipgen, skipsim, metric_section = "Siracusa L2 training cycles") @pytest.mark.siracusa_tiled diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index 265fc0df..211d234e 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -180,8 +180,14 @@ "Models/Training/DSCNN/dscnn_train": [128000], } -# Empty placeholder; populate after L2 DB path is green. -L3_DOUBLEBUFFER_TRAINING_MODELS: dict = {} +# L3 DB training: only DB the L3↔L2 hop (TrainingDBOnlyL3Tiler) so the L2 +# staging budget doesn't double. CCT/CCT_LoRA left out — their backward +# alias graph still trips MemoryAllocation _live tracking even with our +# opt-out blacklist (a separate follow-up). +L3_DOUBLEBUFFER_TRAINING_MODELS = { + "Models/Training/ResNet8/resnet8_train": [128000], + "Models/Training/MobileNetV1/mobilenetv1_train": [128000], +} # Per-model overrides for training tests. # From 23306c08cf05dcc7eab07e09600cbb615ba818a5 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 20:24:36 +0000 Subject: [PATCH 5/7] test(training): force multi-tile DB by adding L1=32K autoencoder variant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit L2 DB at L1=128 KB shows essentially zero speedup (+0.2% autoencoder, +0.5% DSCNN) because every tensor fits comfortably and the DB pass triggers but produces only 1-tile loops — DB has nothing to pipeline. Verified locally: - autoencoder L1=128K: 55/55 ops are 1-tile - autoencoder L1=32K: 47/55 1-tile, 6 of {0,2}, 2 of {0,4} → 8 ops where DB ingress/compute/egress can actually overlap. Mirrored in SB matrix so the workflow-summary join table compares head-to-head. - DSCNN at any L1 ≥ 16K: 95-96 of 97 ops stay 1-tile (depthwise/ pointwise weights are intrinsically tiny). Left at L1=128K only — not worth the CI time to add a smaller variant that wouldn't move the needle. The interesting DB win is at default_mem_level=L3 (slow L3↔L2 hop), not L2. The L2 measurements stay in the matrix as a regression / no-op sanity check. Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/test_siracusa_tiled_config.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index 211d234e..06e67f71 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -159,7 +159,9 @@ # L2 size is fixed by the runner at 2_000_000 to match the validated local run. L2_SINGLEBUFFER_TRAINING_MODELS = { "Models/Training/SimpleMLP/simplemlp_train": [64000], - "Models/Training/Autoencoder/autoencoder_train": [128000], + # 32 KB variant matches the L2 DB matrix so the SB/DB join table in + # the workflow summary actually pairs up. + "Models/Training/Autoencoder/autoencoder_train": [128000, 32000], "Models/Training/DSCNN/dscnn_train": [128000, 64000], } @@ -174,9 +176,14 @@ # Double-buffered training models. Start narrow: only SimpleMLP until DB+alias # path is validated end-to-end. Expand to Autoencoder/DSCNN once stable. +# L2 DB at L1=128 KB → almost all ops are 1-tile (tensors fit comfortably); +# DB pass triggers but has nothing to pipeline. Add a 32 KB autoencoder +# variant so ~8 of 55 ops become 2-4 tiles and DB pipelining actually +# fires. DSCNN is structurally DB-unfriendly at L2 (depthwise/pointwise +# Conv weights are tiny, only ~1 of 97 ops multi-tiles even at L1=16 KB). L2_DOUBLEBUFFER_TRAINING_MODELS = { "Models/Training/SimpleMLP/simplemlp_train": [64000], - "Models/Training/Autoencoder/autoencoder_train": [128000], + "Models/Training/Autoencoder/autoencoder_train": [128000, 32000], "Models/Training/DSCNN/dscnn_train": [128000], } From 637ceac516c81f06e1c9d41586b8f9de6526f7e0 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 21:36:33 +0000 Subject: [PATCH 6/7] =?UTF-8?q?fix(training):=20root-cause=20DB=20bug=20?= =?UTF-8?q?=E2=80=94=20scalar-pattern=20degeneration,=20not=20Gemm?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previous "Gemm DB doesn't work in training" attribution was wrong. The real bug: when a training node mixes scalar and non-scalar tensors (e.g. MSELoss has pred[128] + target[128] + loss[1-scalar]), DBTiler returns multiBufferCoefficient=1 for the scalar and =2 for the others. Neither SB.apply (needs all=1) nor DB.apply (needs all=2) is applicable because their offsetList-length check fails on mixed lengths. Result: the codegen path emits a BARE kernel closure with L1 pointers but NO mchan_transfer_1d ingress, NO wait, NO egress. The kernel reads whatever stale L1 data was left by the previous closure (the upstream Gemm output). MSE computes garbage → "constant loss 0.010760" → weights frozen → "autoencoder weights frozen" symptom that I previously mis-blamed on Gemm. Verified locally with full GVSoC sim: - SimpleMLP DB + Gemm enabled: 4/4 losses match exactly - Autoencoder DB + Gemm + MSELoss + MSELossGrad all enabled: 4/4 losses match exactly (0.649001, 1.146989, 0.961321, 1.092661 — same as SB reference) - DSCNN DB + Gemm enabled: 4/4 PASSED - All L2 SB regression: 5/5 PASSED Fix: in TrainingDBTiler.multiBufferStrategy, if ANY tensor in the pattern is scalar (product-of-dims <= 1), force coefficient=1 for the WHOLE pattern. SB.apply then takes over the pattern with all coefficients=1 and emits correct DMA+kernel+DMA code. Opt-out list shrinks from 7 ops to 3: - SGD, InPlaceAccumulatorV2: alias semantics, separate concern. - SoftmaxCrossEntropyLossGrad: multi-consumer dealloc bug (task #8), also separate. (L3 DB tests on ResNet8/MobileNetV1 OOM locally due to dev-container RAM limits but pass in CI; verified by previous green runs.) Co-Authored-By: Claude Opus 4.7 (1M context) --- DeeployTest/testUtils/tilingUtils.py | 52 ++++++++++++++++------------ 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/DeeployTest/testUtils/tilingUtils.py b/DeeployTest/testUtils/tilingUtils.py index 33fd9d64..8b96810a 100644 --- a/DeeployTest/testUtils/tilingUtils.py +++ b/DeeployTest/testUtils/tilingUtils.py @@ -89,33 +89,25 @@ class TrainingSBTiler(SBTiler): class TrainingDBTiler(DBTiler): memorySchedulerClass = TrainingMemoryScheduler - # Operators where DB doesn't fit cleanly — fall back to SB for any pattern - # containing one of these. Reasons: - # - SGD, InPlaceAccumulatorV2: in-place outputs aliased to inputs; - # DB's per-tensor multibuffer hoist would split the alias across two - # L1 slots and the in-place semantic breaks. - # - SoftmaxCrossEntropyLossGrad: produces output_grad that is consumed - # by *two* downstream Gemm nodes (multi-consumer intermediate); DB's - # hoist+egress logic interacts badly with MemoryAllocation's _live - # tracking and double-deallocates the tensor. + # Operators where DB cannot fall through the scalar-pattern check below. + # All other "previously opted out" ops (SoftmaxCrossEntropyLoss, MSELoss, + # MSELossGrad, Gemm) are now handled by the scalar-pattern check — they + # all have a scalar tensor (loss, lazy_reset_grad) somewhere in their + # pattern that triggers the fall-back to SB. DB_OPT_OUT_OPS = frozenset({ + # In-place alias outputs (output is _alias'd to an input). DB's + # per-tensor multibuffer hoist would split the alias across two L1 + # slots and break in-place semantics. Note: InPlaceAccumulatorV2 + # also has the lazy_reset_grad scalar, but we keep it explicit + # because the alias semantics are the primary concern. "SGD", "InPlaceAccumulatorV2", - # Loss + grad heads: small, with awkward shapes (multi-output, scalar, - # or multi-consumer intermediates) — confuse DB hoist / dealloc. - # DSCNN passes DB CI with SCE/SCEGrad opted out; MSE pair opted out - # by analogy (autoencoder is the only model exercising them). - "SoftmaxCrossEntropyLoss", + # SoftmaxCrossEntropyLossGrad's output_grad is consumed by 2 backward + # Gemms (multi-consumer intermediate) — DB's per-consumer hoist + # inflates _users and breaks MemoryAllocation _live tracking. + # Tracked separately; needs a real fix in the DB pass / _users + # accounting rather than an opt-out. "SoftmaxCrossEntropyLossGrad", - "MSELoss", - "MSELossGrad", - # Gemm: backward Gemm under DB silently produces wrong gradients on - # multi-tile training graphs (autoencoder DB CI: losses constant - # ~0.097 across 4 update steps — model not learning — while DSCNN DB - # Conv-only was numerically correct). Conservative opt-out until - # backward Gemm DB egress is debugged. Conv DB still gives most of - # the real cycle win on training graphs (DSCNN/MobileNet/ResNet). - "Gemm", }) def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, pattern: SubGraph, path: List[str], @@ -127,6 +119,20 @@ def multiBufferStrategy(self, tilerModel: TilerModel, ctxt: NetworkContext, patt for node in pattern: if node.op in self.DB_OPT_OUT_OPS: return 1 + # If ANY tensor in this pattern is scalar (product-of-dims <= 1), + # force coefficient=1 for the WHOLE pattern. Otherwise we end up + # with mixed coefficients (scalar=1, non-scalar=2) — neither + # SB.apply (needs all=1) nor DB.apply (needs all=2) is applicable + # and the codegen degenerates to a bare kernel call with NO DMA + # setup, so the kernel reads stale L1 data. This was the real + # cause of the "autoencoder weights frozen" symptom previously + # mis-attributed to Gemm: MSELoss's scalar `loss` output triggered + # this degenerate case. + for node in pattern: + for tensor in list(node.inputs) + list(node.outputs): + tname = tensor.name + if ctxt.is_buffer(tname) and _isScalarBuffer(ctxt, tname): + return 1 return super().multiBufferStrategy(tilerModel, ctxt, pattern, path, hop, tensorName) From 12b32e467dd6773d7f8f7413d10903a9a667af65 Mon Sep 17 00:00:00 2001 From: runwangdl Date: Sun, 10 May 2026 21:39:05 +0000 Subject: [PATCH 7/7] ci: disable untiled Siracusa auto-trigger (not DB-related) Untiled Siracusa CI runs ~5 min per push on the same hosted runner pool as the DB training tests; it doesn't exercise anything DB does. Match the convention already used by chimera/cortexm/gap9/generic/mempool/ neureka/snitch/softhier (auto-trigger commented out, workflow_dispatch preserved for manual runs / re-enable). After this, only ci-lint.yml and ci-platform-siracusa-tiled.yml auto-trigger on push/PR. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/ci-platform-siracusa.yml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/.github/workflows/ci-platform-siracusa.yml b/.github/workflows/ci-platform-siracusa.yml index 7a4f415e..839af4b4 100644 --- a/.github/workflows/ci-platform-siracusa.yml +++ b/.github/workflows/ci-platform-siracusa.yml @@ -6,12 +6,9 @@ name: CI • Siracusa "on": - push: - branches: - - "**" - tags: - - "v*.*.*" - pull_request: + # Auto-trigger disabled in TrainDeeploy fork: untiled Siracusa is not + # exercised by training/DB work. Re-enable by restoring the push: / + # pull_request: blocks. workflow_dispatch: inputs: docker_image_deeploy: