From a7765a14df4ac0c62cfae568418950c3f23adb02 Mon Sep 17 00:00:00 2001
From: majin0824 <majin15@huawei.com>
Date: Wed, 15 Apr 2026 17:02:18 +0800
Subject: [PATCH 1/5] Refactor: migrate A5 examples and tests to SceneTestCase
 format

- Replace golden.py + kernel_config.py with unified test_*.py files
  using @scene_test decorator and SceneTestCase base class
- Covers examples/a5/{host_build_graph,tensormap_and_ringbuffer} (12 examples)
  and tests/st/a5/{host_build_graph,tensormap_and_ringbuffer} (3 tests)
- Add a5sim to platforms for all cases that support simulation
- Cross-directory kernel references use relative paths (../)
---
 .../paged_attention/golden.py                 |  58 ------
 .../paged_attention/kernels/kernel_config.py  |  76 --------
 .../paged_attention/test_paged_attention.py   | 118 +++++++++++++
 .../tensormap_and_ringbuffer/bgemm/golden.py  |  69 --------
 .../bgemm/kernels/kernel_config.py            |  49 ------
 .../bgemm/test_bgemm.py                       |  81 +++++++++
 .../mixed_example/golden.py                   | 122 -------------
 .../mixed_example/kernels/kernel_config.py    |  74 --------
 .../mixed_example/test_mixed_example.py       | 166 ++++++++++++++++++
 .../paged_attention/golden.py                 |  75 --------
 .../paged_attention/kernels/kernel_config.py  |  78 --------
 .../paged_attention/test_paged_attention.py   | 152 ++++++++++++++++
 .../spmd_basic/golden.py                      |  65 -------
 .../spmd_basic/kernels/kernel_config.py       |  50 ------
 .../spmd_basic/test_spmd_basic.py             |  75 ++++++++
 .../spmd_multiblock_aiv/golden.py             |  63 -------
 .../kernels/kernel_config.py                  |  38 ----
 .../test_spmd_multiblock_aiv.py               |  76 ++++++++
 .../spmd_multiblock_mix/golden.py             |  68 -------
 .../kernels/kernel_config.py                  |  50 ------
 .../test_spmd_multiblock_mix.py               |  82 +++++++++
 .../spmd_starvation/golden.py                 |  84 ---------
 .../spmd_starvation/kernels/kernel_config.py  |  52 ------
 .../spmd_starvation/test_spmd_starvation.py   | 101 +++++++++++
 .../spmd_sync_start/golden.py                 |  66 -------
 .../spmd_sync_start/kernels/kernel_config.py  |  51 ------
 .../spmd_sync_start/test_spmd_sync_start.py   |  80 +++++++++
 .../spmd_sync_start_aiv/golden.py             |  62 -------
 .../kernels/kernel_config.py                  |  40 -----
 .../test_spmd_sync_start_aiv.py               |  78 ++++++++
 .../spmd_sync_start_edge/golden.py            |  66 -------
 .../kernels/kernel_config.py                  |  51 ------
 .../test_spmd_sync_start_edge.py              |  85 +++++++++
 .../spmd_sync_start_stress/golden.py          | 104 -----------
 .../kernels/kernel_config.py                  |  61 -------
 .../test_spmd_sync_start_stress.py            | 112 ++++++++++++
 .../paged_attention/golden.py                 |  58 ------
 .../paged_attention/kernels/kernel_config.py  |  78 --------
 .../paged_attention/test_paged_attention.py   | 118 +++++++++++++
 .../paged_attention/golden.py                 |  63 -------
 .../paged_attention/kernels/kernel_config.py  |  78 --------
 .../paged_attention/test_paged_attention.py   | 134 ++++++++++++++
 .../paged_attention_unroll/golden.py          |  63 -------
 .../kernels/kernel_config.py                  |  78 --------
 .../test_paged_attention_unroll.py            | 133 ++++++++++++++
 45 files changed, 1591 insertions(+), 1990 deletions(-)
 delete mode 100644 examples/a5/host_build_graph/paged_attention/golden.py
 delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py
 create mode 100644 examples/a5/host_build_graph/paged_attention/test_paged_attention.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/bgemm/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/mixed_example/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_basic/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py
 delete mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py
 create mode 100644 examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
 delete mode 100644 tests/st/a5/host_build_graph/paged_attention/golden.py
 delete mode 100644 tests/st/a5/host_build_graph/paged_attention/kernels/kernel_config.py
 create mode 100644 tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/golden.py
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py
 create mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/golden.py
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py
 create mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py

diff --git a/examples/a5/host_build_graph/paged_attention/golden.py b/examples/a5/host_build_graph/paged_attention/golden.py
deleted file mode 100644
index e9672d5dc..000000000
--- a/examples/a5/host_build_graph/paged_attention/golden.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Paged Attention Golden - host_build_graph example (small scale, float16).
-
-Args layout: [query, key_cache, value_cache, block_table, context_lens, out, scale]
-  - Tensors retain original multi-dimensional shapes (ContinuousTensor metadata carries shape/dtype)
-  - scale is a scalar float parameter
-"""
-
-from simpler_setup.goldens.paged_attention import (
-    compute_golden,  # noqa: F401
-    run_golden_test,
-)
-from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs
-
-__outputs__ = ["out"]
-
-RTOL = 1e-2
-ATOL = 1e-2
-
-ALL_CASES = {
-    "Case1": {
-        "batch": 1,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 16,
-        "block_size": 16,
-        "context_len": 16,
-        "max_model_len": 256,
-        "dtype": "float16",
-    },
-    "Case2": {
-        "batch": 1,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 16,
-        "block_size": 16,
-        "context_len": 64,
-        "max_model_len": 256,
-        "dtype": "float16",
-    },
-}
-
-DEFAULT_CASE = "Case1"
-
-
-def generate_inputs(params: dict) -> list:
-    return _generate_inputs(params)
-
-
-if __name__ == "__main__":
-    run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs)
diff --git a/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py b/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py
deleted file mode 100644
index 0245cc8a5..000000000
--- a/examples/a5/host_build_graph/paged_attention/kernels/kernel_config.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Paged Attention Kernel and Orchestration Configuration
-
-Defines the kernels and orchestration function for paged attention
-with AIC/AIV subgraph splitting:
-
-AIC Kernels (Matrix Multiplication):
-  - aic_qk_matmul: Q @ K^T computation
-  - aic_pv_matmul: P @ V computation
-
-AIV Kernels (Vector Operations):
-  - aiv_softmax_prepare: scale, rowmax, exp, rowsum
-  - aiv_online_update: online softmax accumulation + fused normalization
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-# Orchestration config
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"),
-    "function_name": "build_paged_attention_graph",
-    "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-}
-
-# Kernel configs
-KERNELS = [
-    # AIC kernels (matrix multiplication using Cube unit)
-    {
-        "func_id": 0,
-        "name": "QK",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 2,
-        "name": "PV",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    # AIV kernels (vector operations)
-    {
-        "func_id": 1,
-        "name": "SF",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-    },
-    {
-        "func_id": 3,
-        "name": "UP",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-    },
-]
-
-# Runtime configuration
-RUNTIME_CONFIG = {
-    "runtime": "host_build_graph",
-    "aicpu_thread_num": 3,
-    "block_dim": 3,
-}
diff --git a/examples/a5/host_build_graph/paged_attention/test_paged_attention.py b/examples/a5/host_build_graph/paged_attention/test_paged_attention.py
new file mode 100644
index 000000000..7d72b6be1
--- /dev/null
+++ b/examples/a5/host_build_graph/paged_attention/test_paged_attention.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention — host_build_graph example (small scale, float16).
+
+AIC+AIV mixed execution with online softmax paged attention.
+Small-scale cases for quick validation on A5.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="host_build_graph")
+class TestPagedAttention(SceneTestCase):
+    """Paged attention with host_build_graph runtime on A5."""
+
+    RTOL = 1e-2
+    ATOL = 1e-2
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "build_paged_attention_graph",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 16,
+                "max_model_len": 256,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "manual": True,
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 64,
+                "max_model_len": 256,
+                "dtype": "float16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        """Wrap shared paged-attention inputs: torch tensors as Tensor specs, the rest as Scalar."""
+        named_values = _pa_generate_inputs(params)
+        # Spec order follows the (name, value) order returned by the shared generator.
+        specs = [
+            Tensor(name, val) if isinstance(val, torch.Tensor) else Scalar(name, val)
+            for name, val in named_values
+        ]
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        """Run the shared paged-attention golden, then copy every tensor it holds back into ``args``."""
+        by_name = {spec.name: spec.value for spec in args.specs if isinstance(spec, Tensor)}
+        _pa_compute_golden(by_name, params)
+        for name in [spec.name for spec in args.specs if isinstance(spec, Tensor) and spec.name in by_name]:
+            getattr(args, name)[:] = by_name[name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/bgemm/golden.py b/examples/a5/tensormap_and_ringbuffer/bgemm/golden.py
deleted file mode 100644
index 5ab0590c4..000000000
--- a/examples/a5/tensormap_and_ringbuffer/bgemm/golden.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test specification for BGEMM (tensormap_and_ringbuffer Runtime).
-
-Computation: C = A @ B (tiled matrix multiplication)
-Configuration: 4x4x4 grid, 64x64 tiles
-
-Args layout: [A, B, C]  — shape/dtype/size in ContinuousTensor metadata
-"""
-
-import torch
-
-__outputs__ = ["C"]
-RTOL = 1e-3
-ATOL = 1e-3
-
-TILE_M = 64
-TILE_K = 64
-TILE_N = 64
-
-GRID_M = 4
-GRID_K = 4
-GRID_N = 4
-BATCH = 2
-
-M = TILE_M * GRID_M
-K = TILE_K * GRID_K
-N = TILE_N * GRID_N
-
-
-def generate_inputs(params: dict) -> list:
-    """Generate input tensors with tile-first memory layout."""
-    A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01
-    B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01
-    C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32)
-
-    A_flat = A.flatten()
-    B_flat = B.flatten()
-    C_flat = C.flatten()
-
-    return [
-        ("A", A_flat),
-        ("B", B_flat),
-        ("C", C_flat),
-    ]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    """Compute golden result: C[m,n] = sum(k) A[m,k] @ B[k,n]."""
-    A = torch.as_tensor(tensors["A"]).reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K)
-    B = torch.as_tensor(tensors["B"]).reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N)
-    C = torch.as_tensor(tensors["C"]).reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N)
-
-    C[:] = 0.0
-
-    for batch in range(BATCH):
-        for m_idx in range(GRID_M):
-            for n_idx in range(GRID_N):
-                for k_idx in range(GRID_K):
-                    C[batch, m_idx, n_idx] += torch.matmul(A[batch, m_idx, k_idx], B[batch, k_idx, n_idx])
-
-    tensors["C"][:] = C.flatten()
diff --git a/examples/a5/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py
deleted file mode 100644
index 91f2830ec..000000000
--- a/examples/a5/tensormap_and_ringbuffer/bgemm/kernels/kernel_config.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for BGEMM (tensormap_and_ringbuffer Runtime).
-
-Cube core (AIC) for matrix multiplication, Vector core (AIV) for accumulation.
-Uses TPUSH/TPOP for cube-to-vector data transfer (bypasses GM).
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "bgemm_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-    "signature": [D.IN, D.IN, D.OUT],
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "GEMM",
-        "source": str(_KERNELS_ROOT / "mix" / "kernel_bgemm.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 1,
-        "name": "ADD",
-        "source": str(_KERNELS_ROOT / "mix" / "kernel_bgemm.cpp"),
-        "core_type": "aiv",
-        "signature": [D.INOUT, D.IN],
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 3,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py b/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py
new file mode 100644
index 000000000..d7bc46a59
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""BGEMM: batched tiled matrix multiplication C = A @ B.
+
+Fixed 4x4x4 grid with 64x64 tiles, 2 batches.
+Cube core (AIC) for matmul, Vector core (AIV) for accumulation.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+TILE_M, TILE_K, TILE_N = 64, 64, 64
+GRID_M, GRID_K, GRID_N = 4, 4, 4
+BATCH = 2
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestBgemm(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/bgemm_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/mix/kernel_bgemm.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/mix/kernel_bgemm.cpp",
+                "core_type": "aiv",
+                "signature": [D.INOUT, D.IN],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "default",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {},
+        }
+    ]
+
+    def generate_args(self, params):  # ``params`` is unused: every size is a module-level constant
+        lhs = 0.01 * torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32)
+        rhs = 0.01 * torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32)
+        out = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32)
+        return TaskArgsBuilder(Tensor("A", lhs.flatten()), Tensor("B", rhs.flatten()), Tensor("C", out.flatten()))
+
+    def compute_golden(self, args, params):
+        """Write C[b, m, n] = sum_k A[b, m, k] @ B[b, k, n] into ``args.C`` through a zeroed in-place view."""
+        A = args.A.reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K)
+        B = args.B.reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N)
+        C = args.C.view(BATCH, GRID_M, GRID_N, TILE_M, TILE_N).zero_()  # view(): must alias args.C, never a copy
+        for batch in range(BATCH):
+            for m in range(GRID_M):
+                for n in range(GRID_N):
+                    for k in range(GRID_K):
+                        C[batch, m, n] += torch.matmul(A[batch, m, k], B[batch, k, n])
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/golden.py b/examples/a5/tensormap_and_ringbuffer/mixed_example/golden.py
deleted file mode 100644
index acf60ee26..000000000
--- a/examples/a5/tensormap_and_ringbuffer/mixed_example/golden.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test specification for mixed AIC+AIV example.
-
-Covers all 5 resource shapes per iteration:
-  1. AIC_AIV_X2: C = A@B, F = D+E, I = G*H
-  2. AIC_ONLY:   J = A@B
-  3. AIV_X1:     K = D+E
-  4. AIV_X2:     L = D+E, M = G*H
-  5. AIC_AIV_X1: N = A@B, O = D+E
-
-All use 128x128 float32 tiles, repeated over num_iters slices.
-
-Args layout (15 args): [A, B, C, D, E, F, G, H, I, J, K, L, M, N, O]
-  Shape/dtype/size in ContinuousTensor metadata.
-"""
-
-import torch
-
-__outputs__ = ["C", "F", "I", "J", "K", "L", "M", "N", "O"]
-RTOL = 1e-3
-ATOL = 1e-3
-
-ALL_CASES = {
-    "case1": {"num_iters": 4},
-    "case2": {"num_iters": 1},
-}
-
-DEFAULT_CASE = "case1"
-
-MATMUL_SIZE = 128
-TILE_ELEMS = 128 * 128
-
-
-def generate_inputs(params: dict) -> list:
-    num_iters = params["num_iters"]
-
-    torch.manual_seed(42)
-
-    # Matmul inputs (shared by AIC tasks)
-    A = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01
-    B = torch.randn(MATMUL_SIZE, MATMUL_SIZE, dtype=torch.float32) * 0.01
-
-    # Add inputs (shared by AIV add tasks)
-    D = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01
-    E = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01
-
-    # Mul inputs (shared by AIV mul tasks)
-    G = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01
-    H = torch.randn(TILE_ELEMS, dtype=torch.float32) * 0.01
-
-    # Output buffers (num_iters slices each)
-    C = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)
-    F = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)
-    I_out = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)  # noqa: E741
-    J = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)
-    K = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)
-    L = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)
-    M = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)
-    N = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)
-    O_out = torch.zeros(num_iters * TILE_ELEMS, dtype=torch.float32)  # noqa: E741
-
-    A_flat = A.flatten()
-    B_flat = B.flatten()
-
-    return [
-        ("A", A_flat),
-        ("B", B_flat),
-        ("C", C),
-        ("D", D),
-        ("E", E),
-        ("F", F),
-        ("G", G),
-        ("H", H),
-        ("I", I_out),
-        ("J", J),
-        ("K", K),
-        ("L", L),
-        ("M", M),
-        ("N", N),
-        ("O", O_out),
-    ]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    num_iters = params["num_iters"]
-
-    A = torch.as_tensor(tensors["A"]).reshape(MATMUL_SIZE, MATMUL_SIZE)
-    B = torch.as_tensor(tensors["B"]).reshape(MATMUL_SIZE, MATMUL_SIZE)
-    D = torch.as_tensor(tensors["D"])
-    E = torch.as_tensor(tensors["E"])
-    G = torch.as_tensor(tensors["G"])
-    H = torch.as_tensor(tensors["H"])
-
-    golden_matmul = torch.matmul(A, B).flatten()
-    golden_add = D + E
-    golden_mul = G * H
-
-    for name in ["C", "J", "N"]:
-        out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS)
-        for i in range(num_iters):
-            out[i] = golden_matmul
-        tensors[name][:] = out.flatten()
-
-    for name in ["F", "K", "L", "O"]:
-        out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS)
-        for i in range(num_iters):
-            out[i] = golden_add
-        tensors[name][:] = out.flatten()
-
-    for name in ["I", "M"]:
-        out = torch.as_tensor(tensors[name]).reshape(num_iters, TILE_ELEMS)
-        for i in range(num_iters):
-            out[i] = golden_mul
-        tensors[name][:] = out.flatten()
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py
deleted file mode 100644
index 796d2b782..000000000
--- a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/kernel_config.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for mixed AIC+AIV example (tensormap_and_ringbuffer Runtime).
-
-Covers all 5 resource shapes:
-  - AIC_ONLY:   standalone matmul
-  - AIV_X1:     standalone add
-  - AIV_X2:     add (AIV0) + mul (AIV1)
-  - AIC_AIV_X1: matmul (AIC) + add (AIV0)
-  - AIC_AIV_X2: matmul (AIC) + add (AIV0) + mul (AIV1)
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "mixed_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-    "signature": [D.IN, D.IN, D.OUT, D.IN, D.IN, D.OUT, D.IN, D.IN, D.OUT, D.OUT, D.OUT, D.OUT, D.OUT, D.OUT, D.OUT],
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "MATMUL",
-        "source": str(_KERNELS_ROOT / "aic" / "kernel_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 1,
-        "name": "ADD",
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_add.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 2,
-        "name": "MUL",
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_mul.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 3,
-        "name": "ADD_STANDALONE",
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_add_standalone.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 4,
-        "name": "MUL_STANDALONE",
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_mul_standalone.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 3,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py b/examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
new file mode 100644
index 000000000..37a8a92ed
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Mixed AIC+AIV example covering all 5 resource shapes.
+
+  1. AIC_AIV_X2: C = A@B, F = D+E, I = G*H
+  2. AIC_ONLY:   J = A@B
+  3. AIV_X1:     K = D+E
+  4. AIV_X2:     L = D+E, M = G*H
+  5. AIC_AIV_X1: N = A@B, O = D+E
+
+All use 128x128 float32 tiles, repeated over num_iters slices.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+MATMUL_SIZE = 128  # side length of the square A and B matmul operands
+TILE_ELEMS = 128 * 128  # elements in one flattened tile; equals MATMUL_SIZE * MATMUL_SIZE
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestMixedExample(SceneTestCase):
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/mixed_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [
+                D.IN,
+                D.IN,
+                D.OUT,
+                D.IN,
+                D.IN,
+                D.OUT,
+                D.IN,
+                D.IN,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+                D.OUT,
+            ],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/kernel_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/kernel_add.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aiv/kernel_mul.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": "kernels/aiv/kernel_add_standalone.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 4,
+                "source": "kernels/aiv/kernel_mul_standalone.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {"num_iters": 4},
+        },
+        {
+            "name": "case2",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "manual": True,
+            "params": {"num_iters": 1},
+        },
+    ]
+
+    def generate_args(self, params):
+        """Build tensors A..O: six seeded random inputs and nine zero-filled outputs."""
+        out_len = params["num_iters"] * TILE_ELEMS
+        torch.manual_seed(42)
+
+        def rand(*shape):
+            return torch.randn(*shape, dtype=torch.float32) * 0.01
+
+        # Keep the draw order A, B, D, E, G, H: it decides which seeded values each input gets.
+        a_flat = rand(MATMUL_SIZE, MATMUL_SIZE).flatten()
+        b_flat = rand(MATMUL_SIZE, MATMUL_SIZE).flatten()
+        d_in, e_in = rand(TILE_ELEMS), rand(TILE_ELEMS)
+        g_in, h_in = rand(TILE_ELEMS), rand(TILE_ELEMS)
+        outs = {name: torch.zeros(out_len, dtype=torch.float32) for name in "CFIJKLMNO"}
+        return TaskArgsBuilder(
+            Tensor("A", a_flat),
+            Tensor("B", b_flat),
+            Tensor("C", outs["C"]),
+            Tensor("D", d_in),
+            Tensor("E", e_in),
+            Tensor("F", outs["F"]),
+            Tensor("G", g_in),
+            Tensor("H", h_in),
+            Tensor("I", outs["I"]),
+            Tensor("J", outs["J"]),
+            Tensor("K", outs["K"]),
+            Tensor("L", outs["L"]),
+            Tensor("M", outs["M"]),
+            Tensor("N", outs["N"]),
+            Tensor("O", outs["O"]),
+        )
+
+    def compute_golden(self, args, params):
+        """Write the expected tiles into every output tensor held by args.
+
+        C, J, N receive A @ B; F, K, L, O receive D + E; I, M receive G * H.
+        Each output is reshaped to (num_iters, TILE_ELEMS) and every row is
+        assigned the same expected tile.
+        """
+        num_iters = params["num_iters"]
+
+        lhs = args.A.reshape(MATMUL_SIZE, MATMUL_SIZE)
+        rhs = args.B.reshape(MATMUL_SIZE, MATMUL_SIZE)
+
+        # Pair each expected tile with the outputs that must contain it.
+        expected = (
+            (torch.matmul(lhs, rhs).flatten(), ("C", "J", "N")),
+            (args.D + args.E, ("F", "K", "L", "O")),
+            (args.G * args.H, ("I", "M")),
+        )
+
+        for tile, names in expected:
+            for name in names:
+                rows = getattr(args, name).reshape(num_iters, TILE_ELEMS)
+                # Broadcasting copies the tile into each of the num_iters rows.
+                rows[:] = tile
+
+
+if __name__ == "__main__":  # executed directly as a script rather than imported
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py
deleted file mode 100644
index 2b3842381..000000000
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/golden.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Paged Attention Golden - tensormap_and_ringbuffer example (small scale, float16)."""
-
-from simpler_setup.goldens.paged_attention import (
-    compute_golden,  # noqa: F401
-    run_golden_test,
-)
-from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs
-
-__outputs__ = ["out"]
-
-RTOL = 1e-2
-ATOL = 1e-2
-
-ALL_CASES = {
-    "Case1": {
-        "batch": 1,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 16,
-        "block_size": 16,
-        "context_len": 33,
-        "max_model_len": 256,
-        "dtype": "float16",
-    },
-    "Case2": {
-        "batch": 1,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 16,
-        "block_size": 16,
-        "context_len": 128,
-        "max_model_len": 256,
-        "dtype": "float16",
-    },
-    "CaseVarSeq2": {
-        "batch": 2,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 16,
-        "block_size": 16,
-        "context_len": 33,
-        "context_lens_list": [33, 17],
-        "max_model_len": 256,
-        "dtype": "float16",
-    },
-    "CaseVarSeq4": {
-        "batch": 4,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 16,
-        "block_size": 16,
-        "context_len": 128,
-        "context_lens_list": [33, 64, 128, 15],
-        "max_model_len": 256,
-        "dtype": "float16",
-    },
-}
-
-DEFAULT_CASE = "Case1"
-
-
-def generate_inputs(params: dict) -> list:
-    return _generate_inputs(params)
-
-
-if __name__ == "__main__":
-    run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs)
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py
deleted file mode 100644
index eb373f968..000000000
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Paged Attention Kernel and Orchestration Configuration
-
-Defines the kernels and orchestration function for paged attention
-with AIC/AIV subgraph splitting:
-
-AIC Kernels (Matrix Multiplication):
-  - aic_qk_matmul: Q @ K^T computation
-  - aic_pv_matmul: P @ V computation
-
-AIV Kernels (Vector Operations):
-  - aiv_softmax_prepare: scale, rowmax, exp, rowsum
-  - aiv_online_update: online softmax accumulation + fused normalization
-
-Note: aiv_normalize has been merged into aiv_online_update for efficiency.
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-# Orchestration config
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-    "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-}
-
-# Kernel configs (aiv_normalize removed - merged into aiv_online_update)
-KERNELS = [
-    # AIC kernels (matrix multiplication using Cube unit)
-    {
-        "func_id": 0,
-        "name": "QK",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 2,
-        "name": "PV",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    # AIV kernels (vector operations)
-    {
-        "func_id": 1,
-        "name": "SF",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-    },
-    {
-        "func_id": 3,
-        "name": "UP",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-    },
-]
-
-# Runtime configuration
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
new file mode 100644
index 000000000..2e6eb99fb
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention — tensormap_and_ringbuffer example (small scale, float16).
+
+AIC+AIV mixed execution with online softmax paged attention.
+Small-scale cases including variable sequence lengths.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestPagedAttention(SceneTestCase):
+    """Paged attention with tensormap_and_ringbuffer runtime on A5."""
+
+    RTOL = 1e-2
+    ATOL = 1e-2
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "max_model_len": 256,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "max_model_len": 256,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "CaseVarSeq2",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 2,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 33,
+                "context_lens_list": [33, 17],
+                "max_model_len": 256,
+                "dtype": "float16",
+            },
+        },
+        {
+            "name": "CaseVarSeq4",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 4,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 128,
+                "context_lens_list": [33, 64, 128, 15],
+                "max_model_len": 256,
+                "dtype": "float16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        """Wrap the shared paged-attention inputs as Tensor/Scalar specs, preserving their order."""
+        named_values = _pa_generate_inputs(params)
+        # torch.Tensor values become Tensor specs; every other value is passed as a Scalar.
+        specs = [
+            Tensor(name, value) if isinstance(value, torch.Tensor) else Scalar(name, value)
+            for name, value in named_values
+        ]
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        """Run the shared paged-attention golden on the Tensor values and write each result back via args."""
+        tensors = {spec.name: spec.value for spec in args.specs if isinstance(spec, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for name in (spec.name for spec in args.specs if isinstance(spec, Tensor) and spec.name in tensors):
+            getattr(args, name)[:] = tensors[name]
+
+
+if __name__ == "__main__":  # executed directly as a script rather than imported
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_basic/golden.py
deleted file mode 100644
index 0be689b66..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_basic/golden.py
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test for SPMD context accessors (Phase 2: block_dim=1).
-
-Verifies that get_block_idx and get_block_num return correct values for all
-three subtask slots (AIC, AIV0, AIV1) in a MIX task, and that AIV
-kernels read the correct sub_block_id from GlobalContext.
-
-Phase 2 invariants: block_idx=0, block_num=1.
-GlobalContext: sub_block_id 0 (AIV0/left), 1 (AIV1/right).
-
-Output layout (float32[48], 3 cache lines):
-  [0..15]  = AIC  slot: [block_idx, block_num, pad x14]
-  [16..31] = AIV0 slot: [block_idx, block_num, sub_block_id=0, pad x13]
-  [32..47] = AIV1 slot: [block_idx, block_num, sub_block_id=1, pad x13]
-
-Args layout: [output]
-"""
-
-import torch
-
-__outputs__ = ["output"]
-RTOL = 0
-ATOL = 0
-
-ALL_CASES = {
-    "Case1": {},
-}
-
-DEFAULT_CASE = "Case1"
-
-# 16 floats per slot = 64 bytes = 1 cache line
-FLOATS_PER_CACHE_LINE = 16
-
-
-def generate_inputs(params: dict) -> list:
-    output = torch.zeros(3 * FLOATS_PER_CACHE_LINE, dtype=torch.float32)
-    return [
-        ("output", output),
-    ]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    out = torch.as_tensor(tensors["output"])
-    # Cache line 0: AIC (no sub_block_id)
-    out[0] = 0.0  # block_idx
-    out[1] = 1.0  # block_num
-    # Cache line 1: AIV0 (sub_block_id=0)
-    base = 1 * FLOATS_PER_CACHE_LINE
-    out[base + 0] = 0.0  # block_idx
-    out[base + 1] = 1.0  # block_num
-    out[base + 2] = 0.0  # sub_block_id
-    # Cache line 2: AIV1 (sub_block_id=1)
-    base = 2 * FLOATS_PER_CACHE_LINE
-    out[base + 0] = 0.0  # block_idx
-    out[base + 1] = 1.0  # block_num
-    out[base + 2] = 1.0  # sub_block_id
-    tensors["output"][:] = out
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/kernel_config.py
deleted file mode 100644
index 8be342352..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/kernel_config.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for SPMD basic test (tensormap_and_ringbuffer Runtime).
-
-Submits a single MIX task (AIC + AIV0 + AIV1) so all three sub_block_id
-values are exercised in one dispatch.
-"""
-
-from pathlib import Path
-
-_KERNELS_ROOT = Path(__file__).parent
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_basic_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "SPMD_READ_AIC",
-        "source": str(_KERNELS_ROOT / "aic" / "kernel_spmd_read.cpp"),
-        "core_type": "aic",
-    },
-    {
-        "func_id": 1,
-        "name": "SPMD_READ_AIV0",
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_read.cpp"),
-        "core_type": "aiv",
-    },
-    {
-        "func_id": 2,
-        "name": "SPMD_READ_AIV1",
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_read.cpp"),
-        "core_type": "aiv",
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py b/examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
new file mode 100644
index 000000000..55d4cbfb7
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD basic context accessors: single MIX task verifying block_idx, block_num, sub_block_id.
+
+Submits one MIX task (AIC + AIV0 + AIV1) with block_num=1.
+Each subtask writes its SPMD context at a sub_block_id-based offset.
+
+Output layout (float32[48], 3 cache lines):
+  [0..15]  = AIC  slot: [block_idx, block_num, pad x14]
+  [16..31] = AIV0 slot: [block_idx, block_num, sub_block_id=0, pad x13]
+  [32..47] = AIV1 slot: [block_idx, block_num, sub_block_id=1, pad x13]
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16  # 16 float32 values = 64 bytes = one cache-line slot
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdBasic(SceneTestCase):
+    RTOL = 0
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_basic_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],
+        },
+        "incores": [
+            {"func_id": 0, "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"},
+            {"func_id": 1, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
+            {"func_id": 2, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},
+        },
+    ]
+
+    def generate_args(self, params):
+        """Provide a single zero-filled float32 "output" tensor spanning three cache-line slots."""
+        return TaskArgsBuilder(Tensor("output", torch.zeros(3 * FLOATS_PER_CACHE_LINE, dtype=torch.float32)))
+
+    def compute_golden(self, args, params):
+        """Write the expected SPMD context values into each subtask's cache-line slot.
+
+        Each slot starts with [block_idx, block_num]; the two AIV slots add a
+        sub_block_id, while the AIC slot has none and gets only two values.
+        """
+        slots = ((0.0, 1.0), (0.0, 1.0, 0.0), (0.0, 1.0, 1.0))  # AIC, AIV0, AIV1
+        out = args.output
+        for slot, fields in enumerate(slots):
+            start = slot * FLOATS_PER_CACHE_LINE
+            for offset, value in enumerate(fields):
+                out[start + offset] = value
+
+
+if __name__ == "__main__":  # executed directly as a script rather than imported
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/golden.py
deleted file mode 100644
index 5573fd274..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/golden.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test for SPMD multi-block AIV.
-
-Submits five AIV tasks with block_num = 4, 16, 24, 48, 96 to verify:
-  T0 (block_num=4):  basic multi-block — fits within one sched thread
-  T1 (block_num=16): saturates one sched thread (8 clusters × 2 AIV)
-  T2 (block_num=24): forces cross-thread dispatch via ready_queue re-push
-  T3 (block_num=48): occupies all AIV cores across all 3 sched threads
-  T4 (block_num=96): two full rounds of all AIV cores
-
-Each block writes float(block_idx) at cache line (base_cl + block_idx).
-Output tensor: 188 cache lines = 3008 float32.
-
-Args layout: [output]
-"""
-
-import torch
-
-__outputs__ = ["output"]
-RTOL = 0
-ATOL = 0
-
-ALL_CASES = {
-    "Case1": {},
-}
-
-DEFAULT_CASE = "Case1"
-
-FLOATS_PER_CACHE_LINE = 16
-
-# (block_num, base_cl) for each submitted task
-TASKS = [
-    (4, 0),  # T0: basic
-    (16, 4),  # T1: saturate single thread
-    (24, 20),  # T2: cross-thread
-    (48, 44),  # T3: all AIV cores
-    (96, 92),  # T4: two full rounds
-]
-
-TOTAL_CL = sum(block_num for block_num, _ in TASKS)  # 44
-
-
-def generate_inputs(params: dict) -> list:
-    output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)
-    return [
-        ("output", output),
-    ]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    out = torch.as_tensor(tensors["output"])
-    for block_num, base_cl in TASKS:
-        for block_idx in range(block_num):
-            out[(base_cl + block_idx) * FLOATS_PER_CACHE_LINE] = float(block_idx)
-    tensors["output"][:] = out
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/kernel_config.py
deleted file mode 100644
index 68eccd9f7..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/kernel_config.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for SPMD multi-block AIV test (tensormap_and_ringbuffer Runtime).
-
-Submits a single AIV task with block_num=4 so each block writes its
-block_idx at a distinct cacheline-aligned offset.
-"""
-
-from pathlib import Path
-
-_KERNELS_ROOT = Path(__file__).parent
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_multiblock_aiv_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "SPMD_WRITE_AIV",
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_write.cpp"),
-        "core_type": "aiv",
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
new file mode 100644
index 000000000..58becb0b8
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
@@ -0,0 +1,76 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD multi-block AIV: five AIV tasks with varying block_num.
+
+  T0 (block_num=4):  basic multi-block
+  T1 (block_num=16): saturates one sched thread
+  T2 (block_num=24): forces cross-thread dispatch
+  T3 (block_num=48): occupies all AIV cores across all 3 sched threads
+  T4 (block_num=96): two full rounds of all AIV cores
+
+Each block writes float(block_idx) at cache line (base_cl + block_idx).
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16  # float32 elements per cache line (188 CL -> 3008 floats)
+
+TASKS = [  # (block_num, base_cl) for each submitted task
+    (4, 0),  # T0: basic multi-block
+    (16, 4),  # T1: saturates one sched thread
+    (24, 20),  # T2: forces cross-thread dispatch
+    (48, 44),  # T3: occupies all AIV cores
+    (96, 92),  # T4: two full rounds of all AIV cores
+]
+
+TOTAL_CL = sum(block_num for block_num, _ in TASKS)  # 4 + 16 + 24 + 48 + 96 = 188
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdMultiblockAiv(SceneTestCase):
+    RTOL = 0  # NOTE(review): zero tolerances presumably demand an exact match with the golden — confirm
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_multiblock_aiv_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],  # single INOUT argument (generate_args supplies one tensor, "output")
+        },
+        "incores": [  # the only in-core kernel of this scene, declared for AIV cores
+            {"func_id": 0, "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],  # a5sim is the simulation platform; a5 presumably real hardware
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},  # same values as the removed RUNTIME_CONFIG
+            "params": {},  # no per-case parameters
+        },
+    ]
+
+    def generate_args(self, params):  # params is not read
+        output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)  # flat, zero-filled
+        return TaskArgsBuilder(Tensor("output", output))
+
+    def compute_golden(self, args, params):  # fills expected values via args.output; returns None
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):  # block i marks the first float of cache line base_cl + i
+                out[(base_cl + block_idx) * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":  # only when executed as a script, not when imported
+    SceneTestCase.run_module(__name__)  # presumably runs this module's scene tests — confirm in simpler_setup
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/golden.py
deleted file mode 100644
index 9751813d7..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/golden.py
+++ /dev/null
@@ -1,68 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test for SPMD multi-block MIX.
-
-Submits five MIX tasks (AIC + AIV0 + AIV1) with block_num = 2, 8, 12, 24, 48 to verify:
-  T0 (block_num=2):  basic multi-block MIX
-  T1 (block_num=8):  saturates one sched thread (8 clusters)
-  T2 (block_num=12): forces cross-thread dispatch via ready_queue re-push
-  T3 (block_num=24): occupies all clusters across all 3 sched threads
-  T4 (block_num=48): two full rounds of all clusters
-
-Each block occupies 3 cache lines (AIC, AIV0, AIV1).  All three cores
-in the same block write the same float(block_idx) to their respective CL.
-
-Output tensor: 282 cache lines = 4512 float32.
-
-Args layout: [output]
-"""
-
-import torch
-
-__outputs__ = ["output"]
-RTOL = 0
-ATOL = 0
-
-ALL_CASES = {
-    "Case1": {},
-}
-
-DEFAULT_CASE = "Case1"
-
-FLOATS_PER_CACHE_LINE = 16
-SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
-
-# (block_num, base_cl) for each submitted task
-TASKS = [
-    (2, 0),  # T0: basic MIX (6 CL)
-    (8, 6),  # T1: saturate single thread (24 CL)
-    (12, 30),  # T2: cross-thread (36 CL)
-    (24, 66),  # T3: all clusters (72 CL)
-    (48, 138),  # T4: two full rounds (144 CL)
-]
-
-TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS)  # 66
-
-
-def generate_inputs(params: dict) -> list:
-    output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)
-    return [
-        ("output", output),
-    ]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    out = torch.as_tensor(tensors["output"])
-    for block_num, base_cl in TASKS:
-        for block_idx in range(block_num):
-            for slot in range(SLOTS_PER_BLOCK):
-                cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot
-                out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
-    tensors["output"][:] = out
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/kernel_config.py
deleted file mode 100644
index 9f7a517ef..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/kernel_config.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for SPMD multi-block MIX test (tensormap_and_ringbuffer Runtime).
-
-Submits a single MIX task (AIC + AIV0 + AIV1) with block_num=2 so all
-three subtask slots in both blocks see the correct block_idx.
-"""
-
-from pathlib import Path
-
-_KERNELS_ROOT = Path(__file__).parent
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_multiblock_mix_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "SPMD_MIX_AIC",
-        "source": str(_KERNELS_ROOT / "aic" / "kernel_spmd_mix.cpp"),
-        "core_type": "aic",
-    },
-    {
-        "func_id": 1,
-        "name": "SPMD_MIX_AIV0",
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-    {
-        "func_id": 2,
-        "name": "SPMD_MIX_AIV1",
-        "source": str(_KERNELS_ROOT / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
new file mode 100644
index 000000000..1bac22c74
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD multi-block MIX: five MIX tasks with varying block_num.
+
+  T0 (block_num=2):  basic multi-block MIX
+  T1 (block_num=8):  saturates one sched thread
+  T2 (block_num=12): forces cross-thread dispatch
+  T3 (block_num=24): occupies all clusters across all 3 sched threads
+  T4 (block_num=48): two full rounds of all clusters
+
+Each block occupies 3 cache lines (AIC, AIV0, AIV1). All three cores
+in the same block write float(block_idx) to their respective CL.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16  # float32 elements per cache line (282 CL -> 4512 floats)
+SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
+
+TASKS = [  # (block_num, base_cl) for each submitted task
+    (2, 0),  # T0: basic MIX (6 CL)
+    (8, 6),  # T1: saturates one sched thread (24 CL)
+    (12, 30),  # T2: forces cross-thread dispatch (36 CL)
+    (24, 66),  # T3: all clusters (72 CL)
+    (48, 138),  # T4: two full rounds of all clusters (144 CL)
+]
+
+TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS)  # 6 + 24 + 36 + 72 + 144 = 282
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdMultiblockMix(SceneTestCase):
+    RTOL = 0  # NOTE(review): zero tolerances presumably demand an exact match with the golden — confirm
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_multiblock_mix_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],  # single INOUT argument (generate_args supplies one tensor, "output")
+        },
+        "incores": [  # AIC, AIV0, AIV1 in func_id order; both AIV entries point at the same source file
+            {"func_id": 0, "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
+            {"func_id": 1, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 2, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],  # a5sim is the simulation platform; a5 presumably real hardware
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},  # same values as the removed RUNTIME_CONFIG
+            "params": {},  # no per-case parameters
+        },
+    ]
+
+    def generate_args(self, params):  # params is not read
+        output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)  # flat, zero-filled
+        return TaskArgsBuilder(Tensor("output", output))
+
+    def compute_golden(self, args, params):  # fills expected values via args.output; returns None
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                for slot in range(SLOTS_PER_BLOCK):  # all slots of a block carry the same block_idx
+                    cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot  # cache line of this (block, slot)
+                    out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)  # first float of that cache line
+
+
+if __name__ == "__main__":  # only when executed as a script, not when imported
+    SceneTestCase.run_module(__name__)  # presumably runs this module's scene tests — confirm in simpler_setup
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py
deleted file mode 100644
index 2e85b0fb6..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/golden.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test for SPMD starvation prevention.
-
-Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks and
-verifies all 20 tasks complete with correct output.  The test validates that
-the drain mechanism prevents sync_start tasks from being starved.
-
-Layout:
-  Wave 1: 6 x normal(block_num=4)  -> CL 0..71
-  Sync 0: 1 x sync_start(block_num=6) -> CL 72..89
-  Wave 2: 6 x normal(block_num=4)  -> CL 90..161
-  Sync 1: 1 x sync_start(block_num=6) -> CL 162..179
-  Wave 3: 6 x normal(block_num=4)  -> CL 180..251
-
-Total: 252 CL = 4032 float32.
-
-Args layout: [output]
-"""
-
-import torch
-
-__outputs__ = ["output"]
-RTOL = 0
-ATOL = 0
-
-ALL_CASES = {
-    "Case1": {},
-}
-
-DEFAULT_CASE = "Case1"
-
-FLOATS_PER_CACHE_LINE = 16
-SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
-NORMAL_BLOCK_NUM = 4
-SYNC_BLOCK_NUM = 6
-NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK  # 12
-SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK  # 18
-
-
-# Build flat task list as (block_num, base_cl)
-def _build_tasks():
-    tasks = []
-    cl = 0
-    for _ in range(6):
-        tasks.append((NORMAL_BLOCK_NUM, cl))
-        cl += NORMAL_CL
-    tasks.append((SYNC_BLOCK_NUM, cl))
-    cl += SYNC_CL
-    for _ in range(6):
-        tasks.append((NORMAL_BLOCK_NUM, cl))
-        cl += NORMAL_CL
-    tasks.append((SYNC_BLOCK_NUM, cl))
-    cl += SYNC_CL
-    for _ in range(6):
-        tasks.append((NORMAL_BLOCK_NUM, cl))
-        cl += NORMAL_CL
-    return tasks
-
-
-TASKS = _build_tasks()
-TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS)  # 252
-
-
-def generate_inputs(params: dict) -> list:
-    output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)
-    return [("output", output)]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    out = torch.as_tensor(tensors["output"])
-    for block_num, base_cl in TASKS:
-        for block_idx in range(block_num):
-            for slot in range(SLOTS_PER_BLOCK):
-                cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot
-                out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
-    tensors["output"][:] = out
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py
deleted file mode 100644
index 602265c7e..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/kernel_config.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for SPMD starvation prevention test (tensormap_and_ringbuffer Runtime).
-
-Submits 18 normal MIX tasks interleaved with 2 sync_start MIX tasks to verify
-the drain mechanism prevents sync_start tasks from being starved.
-Reuses the same AIC/AIV kernels from spmd_multiblock_mix.
-"""
-
-from pathlib import Path
-
-_KERNELS_ROOT = Path(__file__).parent
-_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels"
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_starvation_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "SPMD_MIX_AIC",
-        "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"),
-        "core_type": "aic",
-    },
-    {
-        "func_id": 1,
-        "name": "SPMD_MIX_AIV0",
-        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-    {
-        "func_id": 2,
-        "name": "SPMD_MIX_AIV1",
-        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
new file mode 100644
index 000000000..425ccdab0
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
@@ -0,0 +1,101 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD starvation prevention: 18 normal MIX + 2 sync_start MIX tasks.
+
+Validates that the drain mechanism prevents sync_start tasks from being starved.
+
+Layout:
+  Wave 1: 6 x normal(block_num=4)      -> CL 0..71
+  Sync 0: 1 x sync_start(block_num=6)  -> CL 72..89
+  Wave 2: 6 x normal(block_num=4)      -> CL 90..161
+  Sync 1: 1 x sync_start(block_num=6)  -> CL 162..179
+  Wave 3: 6 x normal(block_num=4)      -> CL 180..251
+
+Total: 252 CL = 4032 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16  # float32 elements per cache line (252 CL -> 4032 floats)
+SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
+NORMAL_BLOCK_NUM = 4  # block_num of every normal MIX task
+SYNC_BLOCK_NUM = 6  # block_num of every sync_start MIX task
+NORMAL_CL = NORMAL_BLOCK_NUM * SLOTS_PER_BLOCK  # 12 cache lines per normal task
+SYNC_CL = SYNC_BLOCK_NUM * SLOTS_PER_BLOCK  # 18 cache lines per sync_start task
+
+
+def _build_tasks():  # flat task list as (block_num, base_cl); the sync_start flag itself is not recorded
+    tasks = []
+    cl = 0  # base cache line of the next task to append
+    for _ in range(6):  # wave 1: six normal tasks
+        tasks.append((NORMAL_BLOCK_NUM, cl))
+        cl += NORMAL_CL
+    tasks.append((SYNC_BLOCK_NUM, cl))  # sync 0
+    cl += SYNC_CL
+    for _ in range(6):  # wave 2: six normal tasks
+        tasks.append((NORMAL_BLOCK_NUM, cl))
+        cl += NORMAL_CL
+    tasks.append((SYNC_BLOCK_NUM, cl))  # sync 1
+    cl += SYNC_CL
+    for _ in range(6):  # wave 3: six normal tasks
+        tasks.append((NORMAL_BLOCK_NUM, cl))
+        cl += NORMAL_CL
+    return tasks
+
+
+TASKS = _build_tasks()  # 20 entries: 18 normal + 2 sync_start
+TOTAL_CL = sum(bn * SLOTS_PER_BLOCK for bn, _ in TASKS)  # 18 * 12 + 2 * 18 = 252
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdStarvation(SceneTestCase):
+    RTOL = 0  # NOTE(review): zero tolerances presumably demand an exact match with the golden — confirm
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_starvation_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],  # single INOUT argument (generate_args supplies one tensor, "output")
+        },
+        "incores": [  # AIC, AIV0, AIV1 kernels reused from the sibling spmd_multiblock_mix example
+            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
+            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],  # a5sim is the simulation platform; a5 presumably real hardware
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},  # same values as the removed RUNTIME_CONFIG
+            "params": {},  # no per-case parameters
+        },
+    ]
+
+    def generate_args(self, params):  # params is not read
+        output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)  # flat, zero-filled
+        return TaskArgsBuilder(Tensor("output", output))
+
+    def compute_golden(self, args, params):  # fills expected values via args.output; returns None
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                for slot in range(SLOTS_PER_BLOCK):  # all slots of a block carry the same block_idx
+                    cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot  # cache line of this (block, slot)
+                    out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)  # first float of that cache line
+
+
+if __name__ == "__main__":  # only when executed as a script, not when imported
+    SceneTestCase.run_module(__name__)  # presumably runs this module's scene tests — confirm in simpler_setup
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py
deleted file mode 100644
index 33acd1c1a..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/golden.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test for SPMD sync_start.
-
-Submits 4 MIX tasks (3 with require_sync_start=true, 1 baseline) and verifies
-all blocks of every task write the correct float(block_idx) to their cache line.
-
-Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2):
-  T0: block_num=2,  sync_start=True  -> CL 0..5
-  T1: block_num=8,  sync_start=True  -> CL 6..29
-  T2: block_num=2,  sync_start=False -> CL 30..35  (baseline)
-  T3: block_num=12, sync_start=True  -> CL 36..71
-
-Output tensor: 72 cache lines = 1152 float32.
-
-Args layout: [output]
-"""
-
-import torch
-
-__outputs__ = ["output"]
-RTOL = 0
-ATOL = 0
-
-ALL_CASES = {
-    "Case1": {},
-}
-
-DEFAULT_CASE = "Case1"
-
-FLOATS_PER_CACHE_LINE = 16
-SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
-
-# (block_num, base_cl) for each submitted task
-TASKS = [
-    (2, 0),  # T0: sync_start=True
-    (8, 6),  # T1: sync_start=True
-    (2, 30),  # T2: sync_start=False (baseline)
-    (12, 36),  # T3: sync_start=True
-]
-
-TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS)  # 72
-
-
-def generate_inputs(params: dict) -> list:
-    output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)
-    return [
-        ("output", output),
-    ]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    out = torch.as_tensor(tensors["output"])
-    for block_num, base_cl in TASKS:
-        for block_idx in range(block_num):
-            for slot in range(SLOTS_PER_BLOCK):
-                cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot
-                out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
-    tensors["output"][:] = out
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py
deleted file mode 100644
index c689263d5..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/kernel_config.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for SPMD sync_start test (tensormap_and_ringbuffer Runtime).
-
-Submits MIX tasks with require_sync_start=true to verify atomic batch launch.
-Reuses the same AIC/AIV kernels from spmd_multiblock_mix.
-"""
-
-from pathlib import Path
-
-_KERNELS_ROOT = Path(__file__).parent
-_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels"
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "SPMD_MIX_AIC",
-        "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"),
-        "core_type": "aic",
-    },
-    {
-        "func_id": 1,
-        "name": "SPMD_MIX_AIV0",
-        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-    {
-        "func_id": 2,
-        "name": "SPMD_MIX_AIV1",
-        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
new file mode 100644
index 000000000..18320397e
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD sync_start: 4 MIX tasks (3 sync_start + 1 baseline).
+
+Tasks (AIC=slot0, AIV0=slot1, AIV1=slot2):
+  T0: block_num=2,  sync_start=True  -> CL 0..5
+  T1: block_num=8,  sync_start=True  -> CL 6..29
+  T2: block_num=2,  sync_start=False -> CL 30..35  (baseline)
+  T3: block_num=12, sync_start=True  -> CL 36..71
+
+Output tensor: 72 cache lines = 1152 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16  # float32 elements per cache line (72 CL -> 1152 floats)
+SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
+
+TASKS = [  # (block_num, base_cl) for each submitted task
+    (2, 0),  # T0: sync_start=True
+    (8, 6),  # T1: sync_start=True
+    (2, 30),  # T2: sync_start=False (baseline)
+    (12, 36),  # T3: sync_start=True
+]
+
+TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS)  # 6 + 24 + 6 + 36 = 72
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdSyncStart(SceneTestCase):
+    RTOL = 0  # NOTE(review): zero tolerances presumably demand an exact match with the golden — confirm
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_sync_start_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],  # single INOUT argument (generate_args supplies one tensor, "output")
+        },
+        "incores": [  # AIC, AIV0, AIV1 kernels reused from the sibling spmd_multiblock_mix example
+            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
+            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],  # a5sim is the simulation platform; a5 presumably real hardware
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},  # same values as the removed RUNTIME_CONFIG
+            "params": {},  # no per-case parameters
+        },
+    ]
+
+    def generate_args(self, params):  # params is not read
+        output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)  # flat, zero-filled
+        return TaskArgsBuilder(Tensor("output", output))
+
+    def compute_golden(self, args, params):  # fills expected values via args.output; returns None
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                for slot in range(SLOTS_PER_BLOCK):  # all slots of a block carry the same block_idx
+                    cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot  # cache line of this (block, slot)
+                    out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)  # first float of that cache line
+
+
+if __name__ == "__main__":  # only when executed as a script, not when imported
+    SceneTestCase.run_module(__name__)  # presumably runs this module's scene tests — confirm in simpler_setup
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py
deleted file mode 100644
index 3c60f1ac8..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/golden.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test for SPMD sync_start with AIV-only tasks.
-
-Submits 4 AIV tasks (3 with require_sync_start=true, 1 baseline) to exercise
-the AIV-specific fast path (count_idle_aiv_cores) and drain slow path.
-
-Tasks:
-  T0: block_num=4,  sync_start=True  -> CL 0..3    (fast path)
-  T1: block_num=16, sync_start=True  -> CL 4..19   (saturate one thread)
-  T2: block_num=4,  sync_start=False -> CL 20..23  (baseline)
-  T3: block_num=24, sync_start=True  -> CL 24..47  (cross-thread drain)
-
-Output tensor: 48 cache lines = 768 float32.
-
-Args layout: [output]
-"""
-
-import torch
-
-__outputs__ = ["output"]
-RTOL = 0
-ATOL = 0
-
-ALL_CASES = {
-    "Case1": {},
-}
-
-DEFAULT_CASE = "Case1"
-
-FLOATS_PER_CACHE_LINE = 16
-
-# (block_num, base_cl) for each submitted task
-TASKS = [
-    (4, 0),  # T0: sync_start=True, fast path
-    (16, 4),  # T1: sync_start=True, saturate single thread
-    (4, 20),  # T2: sync_start=False, baseline
-    (24, 24),  # T3: sync_start=True, cross-thread drain
-]
-
-TOTAL_CL = sum(block_num for block_num, _ in TASKS)  # 48
-
-
-def generate_inputs(params: dict) -> list:
-    output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)
-    return [("output", output)]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    out = torch.as_tensor(tensors["output"])
-    for block_num, base_cl in TASKS:
-        for block_idx in range(block_num):
-            cl = base_cl + block_idx
-            out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
-    tensors["output"][:] = out
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py
deleted file mode 100644
index bb97aaee2..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/kernel_config.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for SPMD sync_start AIV test (tensormap_and_ringbuffer Runtime).
-
-Submits AIV tasks with require_sync_start=true to verify atomic batch launch
-and the AIV-specific fast path (count_idle_aiv_cores).
-Reuses the same AIV kernel from spmd_multiblock_aiv.
-"""
-
-from pathlib import Path
-
-_KERNELS_ROOT = Path(__file__).parent
-_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels"
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_aiv_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "SPMD_WRITE_AIV",
-        "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"),
-        "core_type": "aiv",
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
new file mode 100644
index 000000000..8a434caa5
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD sync_start AIV: 4 AIV tasks (3 sync_start + 1 baseline).
+
+Exercises AIV-specific fast path (count_idle_aiv_cores) and drain slow path.
+
+Tasks:
+  T0: block_num=4,  sync_start=True  -> CL 0..3    (fast path)
+  T1: block_num=16, sync_start=True  -> CL 4..19   (saturate one thread)
+  T2: block_num=4,  sync_start=False -> CL 20..23  (baseline)
+  T3: block_num=24, sync_start=True  -> CL 24..47  (cross-thread drain)
+
+Output tensor: 48 cache lines = 768 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16  # float32 elements per cache line (docstring: 48 CL = 768 float32)
+
+TASKS = [  # (block_num, base_cl) for each submitted task
+    (4, 0),  # T0: sync_start=True, fast path
+    (16, 4),  # T1: sync_start=True, saturate single thread
+    (4, 20),  # T2: sync_start=False, baseline
+    (24, 24),  # T3: sync_start=True, cross-thread drain
+]
+
+TOTAL_CL = sum(block_num for block_num, _ in TASKS)  # 48
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdSyncStartAiv(SceneTestCase):  # scene test: one orchestration source + one AIV incore entry
+    RTOL = 0  # RTOL/ATOL are 0: presumably an exact-match comparison — TODO confirm
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_sync_start_aiv_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],  # one entry; generate_args builds one tensor ("output")
+        },
+        "incores": [  # AIV write kernel source lives in the spmd_multiblock_aiv example
+            {"func_id": 0, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},  # empty: generate_args/compute_golden do not read params
+        },
+    ]
+
+    def generate_args(self, params):  # builds the single "output" argument
+        output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)  # flat zero-filled buffer
+        return TaskArgsBuilder(Tensor("output", output))
+
+    def compute_golden(self, args, params):  # sets expected values on args.output; no return value
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                cl = base_cl + block_idx  # one cache line per block
+                out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)  # first float of the line only
+
+
+if __name__ == "__main__":  # direct-execution entry point
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py
deleted file mode 100644
index 2bfcaea4a..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/golden.py
+++ /dev/null
@@ -1,66 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test for SPMD sync_start boundary conditions.
-
-Tests edge-case block_num values relative to per-thread cluster capacity (8 clusters
-with 3 sched threads = 24 total clusters, 48 total AIV cores).
-
-MIX tasks (SLOTS_PER_BLOCK=3):
-  T0: block_num=1,  sync_start=True  -> CL 0..2     (degenerate: always fast path)
-  T1: block_num=8,  sync_start=True  -> CL 3..26    (exactly one thread's capacity)
-  T2: block_num=9,  sync_start=True  -> CL 27..53   (one over: must enter drain)
-  T3: block_num=23, sync_start=True  -> CL 54..122  (max valid: total_clusters - 1)
-  T4: block_num=1,  sync_start=False -> CL 123..125  (baseline)
-
-Output tensor: 126 cache lines = 2016 float32.
-
-Args layout: [output]
-"""
-
-import torch
-
-__outputs__ = ["output"]
-RTOL = 0
-ATOL = 0
-
-ALL_CASES = {
-    "Case1": {},
-}
-
-DEFAULT_CASE = "Case1"
-
-FLOATS_PER_CACHE_LINE = 16
-SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
-
-# (block_num, base_cl) for each submitted task
-TASKS = [
-    (1, 0),  # T0: sync=True, degenerate
-    (8, 3),  # T1: sync=True, exactly one thread's clusters
-    (9, 27),  # T2: sync=True, one over -> drain
-    (23, 54),  # T3: sync=True, max valid (total_clusters - 1)
-    (1, 123),  # T4: sync=False, baseline
-]
-
-TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS)  # 126
-
-
-def generate_inputs(params: dict) -> list:
-    output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)
-    return [("output", output)]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    out = torch.as_tensor(tensors["output"])
-    for block_num, base_cl in TASKS:
-        for block_idx in range(block_num):
-            for slot in range(SLOTS_PER_BLOCK):
-                cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot
-                out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
-    tensors["output"][:] = out
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py
deleted file mode 100644
index 30a9ebd1f..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/kernel_config.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for SPMD sync_start boundary test (tensormap_and_ringbuffer Runtime).
-
-Tests edge-case block_num values relative to per-thread cluster capacity.
-Reuses the same AIC/AIV kernels from spmd_multiblock_mix.
-"""
-
-from pathlib import Path
-
-_KERNELS_ROOT = Path(__file__).parent
-_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels"
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_edge_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-}
-
-KERNELS = [
-    {
-        "func_id": 0,
-        "name": "SPMD_MIX_AIC",
-        "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"),
-        "core_type": "aic",
-    },
-    {
-        "func_id": 1,
-        "name": "SPMD_MIX_AIV0",
-        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-    {
-        "func_id": 2,
-        "name": "SPMD_MIX_AIV1",
-        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
new file mode 100644
index 000000000..11a728a02
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD sync_start boundary conditions.
+
+Tests edge-case block_num values relative to per-thread cluster capacity
+(8 clusters x 3 sched threads = 24 total clusters, 48 total AIV cores).
+
+MIX tasks (SLOTS_PER_BLOCK=3):
+  T0: block_num=1,  sync_start=True  -> CL 0..2     (degenerate: always fast path)
+  T1: block_num=8,  sync_start=True  -> CL 3..26    (exactly one thread's capacity)
+  T2: block_num=9,  sync_start=True  -> CL 27..53   (one over: must enter drain)
+  T3: block_num=23, sync_start=True  -> CL 54..122  (max valid: total_clusters - 1)
+  T4: block_num=1,  sync_start=False -> CL 123..125  (baseline)
+
+Output tensor: 126 cache lines = 2016 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16  # float32 elements per cache line (docstring: 126 CL = 2016 float32)
+SLOTS_PER_BLOCK = 3  # AIC, AIV0, AIV1
+
+TASKS = [  # (block_num, base_cl) for each submitted task
+    (1, 0),  # T0: sync=True, degenerate
+    (8, 3),  # T1: sync=True, exactly one thread's clusters
+    (9, 27),  # T2: sync=True, one over -> drain
+    (23, 54),  # T3: sync=True, max valid (total_clusters - 1)
+    (1, 123),  # T4: sync=False, baseline
+]
+
+TOTAL_CL = sum(block_num * SLOTS_PER_BLOCK for block_num, _ in TASKS)  # 126
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdSyncStartEdge(SceneTestCase):  # scene test: one orchestration source + three incore entries
+    RTOL = 0  # RTOL/ATOL are 0: presumably an exact-match comparison — TODO confirm
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_sync_start_edge_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],  # one entry; generate_args builds one tensor ("output")
+        },
+        "incores": [  # func_id 0: AIC; func_id 1/2: AIV0/AIV1 (same source file)
+            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
+            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},  # NOTE(review): docstring says 3 sched threads
+            "params": {},  # empty: generate_args/compute_golden do not read params
+        },
+    ]
+
+    def generate_args(self, params):  # builds the single "output" argument
+        output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)  # flat zero-filled buffer
+        return TaskArgsBuilder(Tensor("output", output))
+
+    def compute_golden(self, args, params):  # sets expected values on args.output; no return value
+        out = args.output
+        for block_num, base_cl in TASKS:
+            for block_idx in range(block_num):
+                for slot in range(SLOTS_PER_BLOCK):
+                    cl = base_cl + block_idx * SLOTS_PER_BLOCK + slot  # cache-line index for this block/slot
+                    out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)  # first float of the line only
+
+
+if __name__ == "__main__":  # direct-execution entry point
+    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py
deleted file mode 100644
index 3315360df..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/golden.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Golden test for SPMD sync_start stress / CAS contention with mixed shapes.
-
-Submits 6 rounds of mixed-shape tasks to stress drain CAS contention, ack
-barrier, and state cleanup across drain cycles.  All three resource shapes
-(MIX, AIV, AIC) are exercised with both sync and non-sync modes.
-
-Each round (9 tasks):
-  4 x normal MIX  (block_num=4, sync=false) -> 4 x 4 x 3 = 48 CL
-  2 x sync MIX    (block_num=12, sync=true) -> 2 x 12 x 3 = 72 CL
-  2 x sync AIV    (block_num=8, sync=true)  -> 2 x 8 x 1 = 16 CL
-  1 x normal AIV  (block_num=4, sync=false) -> 1 x 4 x 1 = 4 CL
-  Round total: 140 CL
-
-6 rounds -> 54 tasks (24 normal MIX + 12 sync MIX + 12 sync AIV + 6 normal AIV)
-Grand total: 840 CL = 13440 float32
-
-Args layout: [output]
-"""
-
-import torch
-
-__outputs__ = ["output"]
-RTOL = 0
-ATOL = 0
-
-ALL_CASES = {
-    "Case1": {},
-}
-
-DEFAULT_CASE = "Case1"
-
-FLOATS_PER_CACHE_LINE = 16
-ROUNDS = 6
-
-# shape constants: (slots_per_block, written_slots)
-# MIX: kernel writes at base_cl + block_idx * 3 + {0,1,2}, 3 CL per block, all written
-# AIV: kernel writes at base_cl + block_idx, 1 CL per block
-SHAPE_MIX = "MIX"
-SHAPE_AIV = "AIV"
-
-MIX_SLOTS = 3
-AIV_SLOTS = 1
-
-NORMAL_MIX_BN = 4
-SYNC_MIX_BN = 12
-SYNC_AIV_BN = 8
-NORMAL_AIV_BN = 4
-
-
-def _build_tasks():
-    """Returns list of (block_num, base_cl, shape_str)."""
-    tasks = []
-    cl = 0
-    for _ in range(ROUNDS):
-        # 4 x normal MIX
-        for _ in range(4):
-            tasks.append((NORMAL_MIX_BN, cl, SHAPE_MIX))
-            cl += NORMAL_MIX_BN * MIX_SLOTS
-        # 2 x sync MIX
-        for _ in range(2):
-            tasks.append((SYNC_MIX_BN, cl, SHAPE_MIX))
-            cl += SYNC_MIX_BN * MIX_SLOTS
-        # 2 x sync AIV
-        for _ in range(2):
-            tasks.append((SYNC_AIV_BN, cl, SHAPE_AIV))
-            cl += SYNC_AIV_BN * AIV_SLOTS
-        # 1 x normal AIV
-        tasks.append((NORMAL_AIV_BN, cl, SHAPE_AIV))
-        cl += NORMAL_AIV_BN * AIV_SLOTS
-    return tasks
-
-
-TASKS = _build_tasks()
-TOTAL_CL = sum(bn * (MIX_SLOTS if shape == SHAPE_MIX else AIV_SLOTS) for bn, _, shape in TASKS)  # 840
-
-
-def generate_inputs(params: dict) -> list:
-    output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)
-    return [("output", output)]
-
-
-def compute_golden(tensors: dict, params: dict) -> None:
-    out = torch.as_tensor(tensors["output"])
-    for block_num, base_cl, shape in TASKS:
-        for block_idx in range(block_num):
-            if shape == SHAPE_MIX:
-                # MIX kernel writes float(block_idx) at all 3 slots
-                for slot in range(MIX_SLOTS):
-                    cl = base_cl + block_idx * MIX_SLOTS + slot
-                    out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
-            else:
-                # AIV kernel writes float(block_idx) at 1 slot
-                cl = base_cl + block_idx
-                out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
-    tensors["output"][:] = out
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py
deleted file mode 100644
index d04b6b27f..000000000
--- a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/kernel_config.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Kernel configuration for SPMD sync_start stress test with mixed shapes.
-
-Submits 54 tasks (MIX + AIV) over 6 rounds to stress-test drain CAS contention,
-ack barrier, and state cleanup between drain cycles.
-Reuses AIC/AIV kernels from spmd_multiblock_mix and spmd_multiblock_aiv.
-"""
-
-from pathlib import Path
-
-_KERNELS_ROOT = Path(__file__).parent
-_MIX_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_mix" / "kernels"
-_AIV_KERNELS = _KERNELS_ROOT.parent.parent / "spmd_multiblock_aiv" / "kernels"
-
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "spmd_sync_start_stress_orch.cpp"),
-    "function_name": "aicpu_orchestration_entry",
-}
-
-KERNELS = [
-    # func_id 0-2: MIX kernels (AIC + AIV0 + AIV1)
-    {
-        "func_id": 0,
-        "name": "SPMD_MIX_AIC",
-        "source": str(_MIX_KERNELS / "aic" / "kernel_spmd_mix.cpp"),
-        "core_type": "aic",
-    },
-    {
-        "func_id": 1,
-        "name": "SPMD_MIX_AIV0",
-        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-    {
-        "func_id": 2,
-        "name": "SPMD_MIX_AIV1",
-        "source": str(_MIX_KERNELS / "aiv" / "kernel_spmd_mix.cpp"),
-        "core_type": "aiv",
-    },
-    # func_id 3: standalone AIV kernel
-    {
-        "func_id": 3,
-        "name": "SPMD_WRITE_AIV",
-        "source": str(_AIV_KERNELS / "aiv" / "kernel_spmd_write.cpp"),
-        "core_type": "aiv",
-    },
-]
-
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
new file mode 100644
index 000000000..a87eb7209
--- /dev/null
+++ b/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""SPMD sync_start stress with mixed shapes (MIX + AIV).
+
+Submits 6 rounds of mixed-shape tasks to stress drain CAS contention,
+ack barrier, and state cleanup across drain cycles.
+
+Each round (9 tasks):
+  4 x normal MIX  (block_num=4,  sync=false) -> 48 CL
+  2 x sync MIX    (block_num=12, sync=true)  -> 72 CL
+  2 x sync AIV    (block_num=8,  sync=true)  -> 16 CL
+  1 x normal AIV  (block_num=4,  sync=false) ->  4 CL
+  Round total: 140 CL
+
+6 rounds -> 54 tasks, grand total: 840 CL = 13440 float32.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+
+FLOATS_PER_CACHE_LINE = 16  # float32 elements per cache line (docstring: 840 CL = 13440 float32)
+ROUNDS = 6  # number of times the 9-task round is repeated
+
+SHAPE_MIX = "MIX"  # tag for tasks whose blocks span MIX_SLOTS cache lines
+SHAPE_AIV = "AIV"  # tag for tasks whose blocks span AIV_SLOTS cache lines
+MIX_SLOTS = 3  # MIX: 3 CL per block, at base_cl + block_idx * 3 + {0,1,2}
+AIV_SLOTS = 1  # AIV: 1 CL per block, at base_cl + block_idx
+
+NORMAL_MIX_BN = 4  # block_num of a normal (sync=false) MIX task
+SYNC_MIX_BN = 12  # block_num of a sync (sync=true) MIX task
+SYNC_AIV_BN = 8  # block_num of a sync (sync=true) AIV task
+NORMAL_AIV_BN = 4  # block_num of a normal (sync=false) AIV task
+
+
+def _build_tasks():  # returns list of (block_num, base_cl, shape_str)
+    tasks = []
+    cl = 0  # running cache-line offset; becomes base_cl of the next task
+    for _ in range(ROUNDS):
+        for _ in range(4):  # 4 x normal MIX
+            tasks.append((NORMAL_MIX_BN, cl, SHAPE_MIX))
+            cl += NORMAL_MIX_BN * MIX_SLOTS
+        for _ in range(2):  # 2 x sync MIX
+            tasks.append((SYNC_MIX_BN, cl, SHAPE_MIX))
+            cl += SYNC_MIX_BN * MIX_SLOTS
+        for _ in range(2):  # 2 x sync AIV
+            tasks.append((SYNC_AIV_BN, cl, SHAPE_AIV))
+            cl += SYNC_AIV_BN * AIV_SLOTS
+        tasks.append((NORMAL_AIV_BN, cl, SHAPE_AIV))  # 1 x normal AIV
+        cl += NORMAL_AIV_BN * AIV_SLOTS
+    return tasks
+
+
+TASKS = _build_tasks()  # 54 tasks: 9 per round x 6 rounds
+TOTAL_CL = sum(bn * (MIX_SLOTS if shape == SHAPE_MIX else AIV_SLOTS) for bn, _, shape in TASKS)  # 840
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestSpmdSyncStartStress(SceneTestCase):  # scene test: one orchestration source + four incore entries
+    RTOL = 0  # RTOL/ATOL are 0: presumably an exact-match comparison — TODO confirm
+    ATOL = 0
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/spmd_sync_start_stress_orch.cpp",
+            "function_name": "aicpu_orchestration_entry",
+            "signature": [D.INOUT],  # one entry; generate_args builds one tensor ("output")
+        },
+        "incores": [  # func_id 0-2: MIX kernels (AIC + AIV0 + AIV1); func_id 3: standalone AIV kernel
+            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
+            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 3, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {},  # empty: generate_args/compute_golden do not read params
+        },
+    ]
+
+    def generate_args(self, params):  # builds the single "output" argument
+        output = torch.zeros(TOTAL_CL * FLOATS_PER_CACHE_LINE, dtype=torch.float32)  # flat zero-filled buffer
+        return TaskArgsBuilder(Tensor("output", output))
+
+    def compute_golden(self, args, params):  # sets expected values on args.output; no return value
+        out = args.output
+        for block_num, base_cl, shape in TASKS:
+            for block_idx in range(block_num):
+                if shape == SHAPE_MIX:  # MIX: float(block_idx) at all 3 slots of the block
+                    for slot in range(MIX_SLOTS):
+                        cl = base_cl + block_idx * MIX_SLOTS + slot
+                        out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
+                else:  # AIV: float(block_idx) at the block's single slot
+                    cl = base_cl + block_idx
+                    out[cl * FLOATS_PER_CACHE_LINE] = float(block_idx)
+
+
+if __name__ == "__main__":  # direct-execution entry point
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a5/host_build_graph/paged_attention/golden.py b/tests/st/a5/host_build_graph/paged_attention/golden.py
deleted file mode 100644
index 623712602..000000000
--- a/tests/st/a5/host_build_graph/paged_attention/golden.py
+++ /dev/null
@@ -1,58 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Paged Attention Golden - host_build_graph test (production scale, bfloat16).
-
-Args layout: [query, key_cache, value_cache, block_table, context_lens, out, scale]
-  - Tensors retain original multi-dimensional shapes (ContinuousTensor metadata carries shape/dtype)
-  - scale is a scalar float parameter
-"""
-
-from simpler_setup.goldens.paged_attention import (
-    compute_golden,  # noqa: F401
-    run_golden_test,
-)
-from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs
-
-__outputs__ = ["out"]
-
-RTOL = 1e-3
-ATOL = 1e-3
-
-ALL_CASES = {
-    "Case1": {
-        "batch": 256,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 128,
-        "block_size": 128,
-        "context_len": 8100,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-    "Case2": {
-        "batch": 64,
-        "num_heads": 64,
-        "kv_head_num": 1,
-        "head_dim": 128,
-        "block_size": 64,
-        "context_len": 8150,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-}
-
-DEFAULT_CASE = "Case1"
-
-
-def generate_inputs(params: dict) -> list:
-    return _generate_inputs(params)
-
-
-if __name__ == "__main__":
-    run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs)
diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/kernel_config.py b/tests/st/a5/host_build_graph/paged_attention/kernels/kernel_config.py
deleted file mode 100644
index 188d983a9..000000000
--- a/tests/st/a5/host_build_graph/paged_attention/kernels/kernel_config.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Paged Attention Kernel and Orchestration Configuration
-
-Defines the kernels and orchestration function for paged attention
-with AIC/AIV subgraph splitting:
-
-AIC Kernels (Matrix Multiplication):
-  - aic_qk_matmul: Q @ K^T computation
-  - aic_pv_matmul: P @ V computation
-
-AIV Kernels (Vector Operations):
-  - aiv_softmax_prepare: scale, rowmax, exp, rowsum
-  - aiv_online_update: online softmax accumulation + fused normalization
-
-Note: aiv_normalize has been merged into aiv_online_update for efficiency.
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-# Orchestration config
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"),
-    "function_name": "build_paged_attention_graph",
-    "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-}
-
-# Kernel configs (aiv_normalize removed - merged into aiv_online_update)
-KERNELS = [
-    # AIC kernels (matrix multiplication using Cube unit)
-    {
-        "func_id": 0,
-        "name": "QK",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 2,
-        "name": "PV",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    # AIV kernels (vector operations)
-    {
-        "func_id": 1,
-        "name": "SF",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-    },
-    {
-        "func_id": 3,
-        "name": "UP",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-    },
-]
-
-# Runtime configuration
-RUNTIME_CONFIG = {
-    "runtime": "host_build_graph",
-    "aicpu_thread_num": 3,
-    "block_dim": 24,
-}
diff --git a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py
new file mode 100644
index 000000000..2d3b12d3b
--- /dev/null
+++ b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention — host_build_graph test (production scale, bfloat16).
+
+AIC+AIV mixed execution with online softmax paged attention.
+Production-scale cases for A5 hardware validation.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="host_build_graph")
+class TestPagedAttentionHostBuildGraph(SceneTestCase):
+    """Paged attention with host_build_graph runtime on A5."""
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "build_paged_attention_graph",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 3, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8100,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 3, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8150,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        inputs = _pa_generate_inputs(params)
+        specs = []
+        for name, val in inputs:
+            if isinstance(val, torch.Tensor):
+                specs.append(Tensor(name, val))
+            else:
+                specs.append(Scalar(name, val))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/golden.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/golden.py
deleted file mode 100644
index 86d5ccb9f..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/golden.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Paged Attention Golden - tensormap_and_ringbuffer test (production scale, bfloat16)."""
-
-from simpler_setup.goldens.paged_attention import (
-    compute_golden,  # noqa: F401
-    run_golden_test,
-)
-from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs
-
-__outputs__ = ["out"]
-
-RTOL = 1e-3
-ATOL = 1e-3
-
-ALL_CASES = {
-    "Case1": {
-        "batch": 256,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 128,
-        "block_size": 128,
-        "context_len": 8192,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-    "Case2": {
-        "batch": 64,
-        "num_heads": 64,
-        "kv_head_num": 1,
-        "head_dim": 128,
-        "block_size": 64,
-        "context_len": 8192,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-    "Case3": {
-        "batch": 64,
-        "num_heads": 64,
-        "kv_head_num": 1,
-        "head_dim": 256,
-        "block_size": 64,
-        "context_len": 8192,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-}
-
-DEFAULT_CASE = "Case1"
-
-
-def generate_inputs(params: dict) -> list:
-    return _generate_inputs(params)
-
-
-if __name__ == "__main__":
-    run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs)
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py
deleted file mode 100644
index 415af4dee..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/kernel_config.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Paged Attention Kernel and Orchestration Configuration
-
-Defines the kernels and orchestration function for paged attention
-with AIC/AIV subgraph splitting:
-
-AIC Kernels (Matrix Multiplication):
-  - aic_qk_matmul: Q @ K^T computation
-  - aic_pv_matmul: P @ V computation
-
-AIV Kernels (Vector Operations):
-  - aiv_softmax_prepare: scale, rowmax, exp, rowsum
-  - aiv_online_update: online softmax accumulation + fused normalization
-
-Note: aiv_normalize has been merged into aiv_online_update for efficiency.
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-# Orchestration config
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"),
-    "function_name": "build_paged_attention_graph",
-    "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-}
-
-# Kernel configs (aiv_normalize removed - merged into aiv_online_update)
-KERNELS = [
-    # AIC kernels (matrix multiplication using Cube unit)
-    {
-        "func_id": 0,
-        "name": "QK",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 2,
-        "name": "PV",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    # AIV kernels (vector operations)
-    {
-        "func_id": 1,
-        "name": "SF",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-    },
-    {
-        "func_id": 3,
-        "name": "UP",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-    },
-]
-
-# Runtime configuration
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 24,
-}
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
new file mode 100644
index 000000000..4e3a52890
--- /dev/null
+++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
@@ -0,0 +1,134 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention — tensormap_and_ringbuffer test (production scale, bfloat16).
+
+AIC+AIV mixed execution with online softmax paged attention.
+Production-scale cases for A5 hardware validation.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestPagedAttention(SceneTestCase):
+    """Paged attention with tensormap_and_ringbuffer runtime on A5."""
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "build_paged_attention_graph",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        inputs = _pa_generate_inputs(params)
+        specs = []
+        for name, val in inputs:
+            if isinstance(val, torch.Tensor):
+                specs.append(Tensor(name, val))
+            else:
+                specs.append(Scalar(name, val))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/golden.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/golden.py
deleted file mode 100644
index 4bbbe98ad..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/golden.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Paged Attention Unroll Golden - tensormap_and_ringbuffer test (production scale, bfloat16)."""
-
-from simpler_setup.goldens.paged_attention import (
-    compute_golden,  # noqa: F401  # re-exported for ci.py's dynamic golden-module loader
-    run_golden_test,
-)
-from simpler_setup.goldens.paged_attention import generate_inputs as _generate_inputs
-
-__outputs__ = ["out"]
-
-RTOL = 1e-3
-ATOL = 1e-3
-
-ALL_CASES = {
-    "Case1": {
-        "batch": 256,
-        "num_heads": 16,
-        "kv_head_num": 1,
-        "head_dim": 128,
-        "block_size": 128,
-        "context_len": 8192,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-    "Case2": {
-        "batch": 64,
-        "num_heads": 64,
-        "kv_head_num": 1,
-        "head_dim": 128,
-        "block_size": 64,
-        "context_len": 8192,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-    "Case3": {
-        "batch": 64,
-        "num_heads": 64,
-        "kv_head_num": 1,
-        "head_dim": 256,
-        "block_size": 64,
-        "context_len": 8192,
-        "max_model_len": 32768,
-        "dtype": "bfloat16",
-    },
-}
-
-DEFAULT_CASE = "Case1"
-
-
-def generate_inputs(params: dict) -> list:
-    return _generate_inputs(params)
-
-
-if __name__ == "__main__":
-    run_golden_test(ALL_CASES, DEFAULT_CASE, generate_inputs, label="Paged Attention Unroll")
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py
deleted file mode 100644
index 5d51b4917..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/kernel_config.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Paged Attention Kernel and Orchestration Configuration
-
-Defines the kernels and orchestration function for paged attention
-with AIC/AIV subgraph splitting:
-
-AIC Kernels (Matrix Multiplication):
-  - aic_qk_matmul: Q @ K^T computation
-  - aic_pv_matmul: P @ V computation
-
-AIV Kernels (Vector Operations):
-  - aiv_softmax_prepare: scale, rowmax, exp, rowsum
-  - aiv_online_update: online softmax accumulation + fused normalization
-
-Note: aiv_normalize has been merged into aiv_online_update for efficiency.
-"""
-
-from pathlib import Path
-
-from simpler.task_interface import ArgDirection as D  # pyright: ignore[reportAttributeAccessIssue]
-
-_KERNELS_ROOT = Path(__file__).parent
-
-# Orchestration config
-ORCHESTRATION = {
-    "source": str(_KERNELS_ROOT / "orchestration" / "paged_attention_orch.cpp"),
-    "function_name": "build_paged_attention_graph",
-    "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-}
-
-# Kernel configs (aiv_normalize removed - merged into aiv_online_update)
-KERNELS = [
-    # AIC kernels (matrix multiplication using Cube unit)
-    {
-        "func_id": 0,
-        "name": "QK",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_qk_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    {
-        "func_id": 2,
-        "name": "PV",
-        "source": str(_KERNELS_ROOT / "aic" / "aic_pv_matmul.cpp"),
-        "core_type": "aic",
-        "signature": [D.IN, D.IN, D.OUT],
-    },
-    # AIV kernels (vector operations)
-    {
-        "func_id": 1,
-        "name": "SF",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_softmax_prepare.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-    },
-    {
-        "func_id": 3,
-        "name": "UP",
-        "source": str(_KERNELS_ROOT / "aiv" / "aiv_online_update.cpp"),
-        "core_type": "aiv",
-        "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-    },
-]
-
-# Runtime configuration
-RUNTIME_CONFIG = {
-    "runtime": "tensormap_and_ringbuffer",
-    "aicpu_thread_num": 4,
-    "block_dim": 36,
-}
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py
new file mode 100644
index 000000000..f79a98c0d
--- /dev/null
+++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py
@@ -0,0 +1,133 @@
+#!/usr/bin/env python3
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""Paged attention unroll — tensormap_and_ringbuffer test (production scale, bfloat16).
+
+Same algorithm as paged_attention but with higher block_dim for unrolled dispatch.
+"""
+
+import torch
+from simpler.task_interface import ArgDirection as D
+
+from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
+from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
+from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
+
+
+@scene_test(level=2, runtime="tensormap_and_ringbuffer")
+class TestPagedAttentionUnroll(SceneTestCase):
+    """Paged attention unroll with tensormap_and_ringbuffer runtime on A5."""
+
+    RTOL = 1e-3
+    ATOL = 1e-3
+
+    CALLABLE = {
+        "orchestration": {
+            "source": "kernels/orchestration/paged_attention_orch.cpp",
+            "function_name": "build_paged_attention_graph",
+            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
+        },
+        "incores": [
+            {
+                "func_id": 0,
+                "source": "kernels/aic/aic_qk_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 2,
+                "source": "kernels/aic/aic_pv_matmul.cpp",
+                "core_type": "aic",
+                "signature": [D.IN, D.IN, D.OUT],
+            },
+            {
+                "func_id": 1,
+                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
+            },
+            {
+                "func_id": 3,
+                "source": "kernels/aiv/aiv_online_update.cpp",
+                "core_type": "aiv",
+                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
+            },
+        ],
+    }
+
+    CASES = [
+        {
+            "name": "Case1",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 36},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 36},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 36},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+    ]
+
+    def generate_args(self, params):
+        inputs = _pa_generate_inputs(params)
+        specs = []
+        for name, val in inputs:
+            if isinstance(val, torch.Tensor):
+                specs.append(Tensor(name, val))
+            else:
+                specs.append(Scalar(name, val))
+        return TaskArgsBuilder(*specs)
+
+    def compute_golden(self, args, params):
+        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
+        _pa_compute_golden(tensors, params)
+        for s in args.specs:
+            if isinstance(s, Tensor) and s.name in tensors:
+                getattr(args, s.name)[:] = tensors[s.name]
+
+
+if __name__ == "__main__":
+    SceneTestCase.run_module(__name__)

From 3f8119c5a90a760fd2be31bcbb4df3593941894b Mon Sep 17 00:00:00 2001
From: majin0824 <majin15@huawei.com>
Date: Thu, 16 Apr 2026 10:13:43 +0800
Subject: [PATCH 2/5] Refactor: migrate remaining TMR examples to tests/st and
 upgrade paged attention

- Move spmd_* and mixed_example from examples/a5/tensormap_and_ringbuffer/
  to tests/st/a5/tensormap_and_ringbuffer/ (TMR = tensormap_and_ringbuffer)
- Remove duplicate host_build_graph (HBG) paged_attention from examples/a5/
  (already in tests/st/a5/)
- Remove old TMR paged_attention from tests/st/a5/ (kept in examples/a5/ as
  evolving reference)
- Upgrade TMR paged_attention: fp16 -> bfloat16, multi-tile dispatch (16x128, 64x64),
  production-scale cases (batch=256, head_dim=128/256), tighter tolerances (1e-3)
- Add small-tile (16,16,16) dispatch path to HBG paged_attention kernels
  with SmallCase1/SmallCase2 sim-compatible test cases
---
 .../kernels/aic/aic_pv_matmul.cpp             | 101 -------
 .../kernels/aic/aic_qk_matmul.cpp             | 102 -------
 .../kernels/aiv/aiv_online_update.cpp         | 230 --------------
 .../kernels/aiv/aiv_softmax_prepare.cpp       | 110 -------
 .../orchestration/paged_attention_orch.cpp    | 252 ----------------
 .../paged_attention/test_paged_attention.py   | 118 --------
 .../kernels/aic/aic_pv_matmul.cpp             |  46 +--
 .../kernels/aic/aic_qk_matmul.cpp             |  44 ++-
 .../kernels/aiv/aiv_online_update.cpp         | 209 +++++++------
 .../kernels/aiv/aiv_softmax_prepare.cpp       |  66 ++--
 .../orchestration/paged_attention_orch.cpp    | 157 ++++++++--
 .../paged_attention/test_paged_attention.py   |  71 ++++-
 .../paged_attention/README.md                 |  73 +++--
 .../kernels/aic/aic_pv_matmul.cpp             |   7 +-
 .../kernels/aic/aic_qk_matmul.cpp             |   7 +-
 .../kernels/aiv/aiv_online_update.cpp         |   6 +-
 .../kernels/aiv/aiv_softmax_prepare.cpp       |   6 +-
 .../paged_attention/test_paged_attention.py   |  31 ++
 .../kernels/aic/kernel_matmul.cpp             |   0
 .../mixed_example/kernels/aiv/kernel_add.cpp  |   0
 .../kernels/aiv/kernel_add_standalone.cpp     |   0
 .../mixed_example/kernels/aiv/kernel_mul.cpp  |   0
 .../kernels/aiv/kernel_mul_standalone.cpp     |   0
 .../kernels/orchestration/mixed_orch.cpp      |   0
 .../mixed_example/test_mixed_example.py       |   0
 .../kernels/aic/aic_pv_matmul.cpp             | 112 -------
 .../kernels/aic/aic_qk_matmul.cpp             | 113 -------
 .../kernels/aiv/aiv_online_update.cpp         | 246 ---------------
 .../kernels/aiv/aiv_softmax_prepare.cpp       | 146 ---------
 .../orchestration/paged_attention_orch.cpp    | 281 ------------------
 .../paged_attention/test_paged_attention.py   | 134 ---------
 .../kernels/aic/kernel_spmd_read.cpp          |   0
 .../kernels/aiv/kernel_spmd_read.cpp          |   0
 .../kernels/orchestration/spmd_basic_orch.cpp |   0
 .../spmd_basic/test_spmd_basic.py             |   0
 .../kernels/aiv/kernel_spmd_write.cpp         |   0
 .../spmd_multiblock_aiv_orch.cpp              |   0
 .../test_spmd_multiblock_aiv.py               |   0
 .../kernels/aic/kernel_spmd_mix.cpp           |   0
 .../kernels/aiv/kernel_spmd_mix.cpp           |   0
 .../spmd_multiblock_mix_orch.cpp              |   0
 .../test_spmd_multiblock_mix.py               |   0
 .../orchestration/spmd_starvation_orch.cpp    |   0
 .../spmd_starvation/test_spmd_starvation.py   |   0
 .../orchestration/spmd_sync_start_orch.cpp    |   0
 .../spmd_sync_start/test_spmd_sync_start.py   |   0
 .../spmd_sync_start_aiv_orch.cpp              |   0
 .../test_spmd_sync_start_aiv.py               |   0
 .../spmd_sync_start_edge_orch.cpp             |   0
 .../test_spmd_sync_start_edge.py              |   0
 .../spmd_sync_start_stress_orch.cpp           |   0
 .../test_spmd_sync_start_stress.py            |   0
 52 files changed, 483 insertions(+), 2185 deletions(-)
 delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
 delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
 delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
 delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
 delete mode 100644 examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp
 delete mode 100644 examples/a5/host_build_graph/paged_attention/test_paged_attention.py
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py (100%)
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
 delete mode 100644 tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp (100%)
 rename {examples => tests/st}/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py (100%)

diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
deleted file mode 100644
index 75aa44e5b..000000000
--- a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N)
-//
-// Fixed tile size: (16, 16) @ (16, 16) -> (16, 16)
-//
-// pij is float16 (converted from fp32 in softmax_prepare via TCVT).
-// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout.
-// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB.
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-static __aicore__ void pv_matmul_impl(__gm__ uint8_t *pij_raw, __gm__ uint8_t *vj_raw, __gm__ uint8_t *oi_raw) {
-    constexpr int M = 16, K = 16, N = 16;
-
-    __gm__ half *pij = reinterpret_cast<__gm__ half *>(pij_raw);
-    __gm__ half *vj = reinterpret_cast<__gm__ half *>(vj_raw);
-    __gm__ float *oi = reinterpret_cast<__gm__ float *>(oi_raw);
-
-    // pij (M, K) fp16, vj (K, N) fp16 in ND (row-major), oi_new (M, N) fp32
-    using GlobalA = GlobalTensor<half, Shape<1, 1, 1, M, K>, pto::Stride<M * K, M * K, M * K, K, 1>>;
-    using GlobalB = GlobalTensor<half, Shape<1, 1, 1, K, N>, pto::Stride<K * N, K * N, K * N, N, 1>>;
-    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<M * N, M * N, M * N, N, 1>>;
-
-    GlobalA pijGlobal(pij);
-    GlobalB vjGlobal(vj);
-    GlobalOut oiGlobal(oi);
-
-    // L1 Mat tiles: standard ND pattern for both A and B
-    using TileMatA = Tile<TileType::Mat, half, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, half, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
-
-    // L0 tiles
-    using LeftTile = TileLeft<half, M, K, M, K>;
-    using RightTile = TileRight<half, K, N, K, N>;
-    using AccTile = TileAcc<float, M, N, M, N>;
-
-    TileMatA aMatTile;
-    TileMatB bMatTile;
-    TASSIGN(aMatTile, 0x0);
-    TASSIGN(bMatTile, 0x20000);
-
-    LeftTile aTile;
-    RightTile bTile;
-    AccTile cTile;
-    TASSIGN(aTile, 0x0);
-    TASSIGN(bTile, 0x0);
-    TASSIGN(cTile, 0x0);
-
-    // Load pij and vj to L1
-    TLOAD(aMatTile, pijGlobal);
-    TLOAD(bMatTile, vjGlobal);
-
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-    // Move to L0A/L0B
-    TMOV(aTile, aMatTile);
-    TMOV(bTile, bMatTile);
-
-    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-    // Single matmul: (M,K) x (K,N) -> (M,N)
-    TMATMUL(cTile, aTile, bTile);
-
-    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-
-    TSTORE(oiGlobal, cTile);
-
-    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-}
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ uint8_t *pij = reinterpret_cast<__gm__ uint8_t *>(args[0]);
-    __gm__ uint8_t *vj = reinterpret_cast<__gm__ uint8_t *>(args[1]);
-    __gm__ uint8_t *oi_new = reinterpret_cast<__gm__ uint8_t *>(args[2]);
-
-    pv_matmul_impl(pij, vj, oi_new);
-}
diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
deleted file mode 100644
index 6322ee6ab..000000000
--- a/examples/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N)
-//
-// Fixed tile size: (16, 16) @ (16, 16).T -> (16, 16)
-//
-// kj is stored as (N, K) = (block_size, head_dim) in row-major memory.
-// This is equivalent to (K, N) in column-major (DN) layout.
-// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern.
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-static __aicore__ void qk_matmul_impl(__gm__ uint8_t *qi_raw, __gm__ uint8_t *kj_raw, __gm__ uint8_t *sij_raw) {
-    constexpr int M = 16, K = 16, N = 16;
-
-    __gm__ half *qi = reinterpret_cast<__gm__ half *>(qi_raw);
-    __gm__ half *kj = reinterpret_cast<__gm__ half *>(kj_raw);
-    __gm__ float *sij = reinterpret_cast<__gm__ float *>(sij_raw);
-
-    // qi (M, K) fp16 in ND (row-major) layout
-    using GlobalA = GlobalTensor<half, Shape<1, 1, 1, M, K>, pto::Stride<M * K, M * K, M * K, K, 1>>;
-    // kj stored as (N, K) row-major = (K, N) column-major -> DN layout
-    using GlobalB = GlobalTensor<half, Shape<1, 1, 1, K, N>, pto::Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
-    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<M * N, M * N, M * N, N, 1>>;
-
-    GlobalA qiGlobal(qi);
-    GlobalB kjGlobal(kj);
-    GlobalOut sijGlobal(sij);
-
-    // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor)
-    using TileMatA = Tile<TileType::Mat, half, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, half, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
-
-    // L0 tiles
-    using LeftTile = TileLeft<half, M, K, M, K>;
-    using RightTile = TileRight<half, K, N, K, N>;
-    using AccTile = TileAcc<float, M, N, M, N>;
-
-    TileMatA aMatTile;
-    TileMatB bMatTile;
-    TASSIGN(aMatTile, 0x0);
-    TASSIGN(bMatTile, 0x20000);
-
-    LeftTile aTile;
-    RightTile bTile;
-    AccTile cTile;
-    TASSIGN(aTile, 0x0);
-    TASSIGN(bTile, 0x0);
-    TASSIGN(cTile, 0x0);
-
-    // Load qi and kj to L1
-    TLOAD(aMatTile, qiGlobal);
-    TLOAD(bMatTile, kjGlobal);
-
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-    // Move to L0A/L0B
-    TMOV(aTile, aMatTile);
-    TMOV(bTile, bMatTile);
-
-    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-    // Single matmul: (M,K) x (K,N) -> (M,N)
-    TMATMUL(cTile, aTile, bTile);
-
-    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-
-    TSTORE(sijGlobal, cTile);
-
-    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-}
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ uint8_t *qi = reinterpret_cast<__gm__ uint8_t *>(args[0]);
-    __gm__ uint8_t *kj = reinterpret_cast<__gm__ uint8_t *>(args[1]);
-    __gm__ uint8_t *sij = reinterpret_cast<__gm__ uint8_t *>(args[2]);
-
-    qk_matmul_impl(qi, kj, sij);
-}
diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
deleted file mode 100644
index 5563b36ff..000000000
--- a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
+++ /dev/null
@@ -1,230 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-// Online Softmax Update + Normalize Kernel (AIV)
-//
-// Fixed tile size: oi/oi_new are (16, 16), mij/lij/mi/li are 16-element vectors
-//
-// Scalar layout strategy:
-//   M scalar floats stored contiguously in GM can be loaded as either:
-//   - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD)
-//   - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV)
-//   Conversion between layouts uses GM round-trip: ND TSTORE -> DN TLOAD.
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-static __aicore__ void online_update_impl(
-    __gm__ uint8_t *mij_raw, __gm__ uint8_t *lij_raw, __gm__ uint8_t *oi_new_raw, __gm__ uint8_t *mi_raw,
-    __gm__ uint8_t *li_raw, __gm__ uint8_t *oi_raw, int is_first, int is_last, __gm__ uint8_t *dst_raw
-) {
-    constexpr int M = 16, N = 16;
-
-    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij_raw);
-    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij_raw);
-    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new_raw);
-    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi_raw);
-    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li_raw);
-    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi_raw);
-    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst_raw);
-
-    // Scalar tile dimensions for RowMajor layout:
-    // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block)
-    // kScalarRows = M / 8 (M=16 -> 2 rows)
-    constexpr int kScalarCols = 32 / sizeof(float);
-    constexpr int kScalarRows = M / kScalarCols;
-    // Aligned rows for ColMajor DN tiles (32-byte alignment)
-    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
-
-    // --- GlobalTensor types ---
-
-    // Data (M, N) RowMajor
-    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
-
-    // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor
-    using GlobalScalarND =
-        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, pto::Stride<1, 1, 1, kScalarCols, 1>>;
-
-    // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor
-    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>;
-
-    // --- GlobalTensor instances ---
-
-    GlobalDataMxN oiNewGlobal(oi_new_ptr);
-    GlobalDataMxN oiGlobal(oi_ptr);
-    GlobalDataMxN dstGlobal(dst_ptr);
-
-    // ND globals for scalar element-wise operations
-    GlobalScalarND mijGlobalND(mij_ptr);
-    GlobalScalarND lijGlobalND(lij_ptr);
-    GlobalScalarND miGlobalND(mi_ptr);
-    GlobalScalarND liGlobalND(li_ptr);
-
-    // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE)
-    GlobalScalarDN mijGlobalDN(mij_ptr);
-    GlobalScalarDN lijGlobalDN(lij_ptr);
-    GlobalScalarDN liGlobalDN(li_ptr);
-
-    // --- Tile types ---
-
-    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
-    using TileScalarND =
-        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
-    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
-
-    // --- UB memory layout ---
-
-    constexpr int kDataBytes = M * N * sizeof(float);
-    constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float);
-    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
-
-    // Data tiles
-    TileDataMxN oiNewTile;
-    TileDataMxN oiTile;
-
-    // Scalar ND tiles for element-wise arithmetic
-    TileScalarND mijND, lijND, miND, liND;
-    TileScalarND miNewND, alphaND, betaND, tmpND;
-
-    // Scalar DN tiles for TROWEXPAND operations
-    TileScalarDN alphaDN, betaDN, liDN;
-
-    TASSIGN(oiNewTile, 0);
-    TASSIGN(oiTile, kDataBytes);
-    TASSIGN(mijND, 2 * kDataBytes);
-    TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes);
-    TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes);
-    TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes);
-    TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes);
-    TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes);
-    TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes);
-    TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes);
-    TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes);
-    TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes);
-    TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes);
-
-    if (is_first) {
-        // --- First block: copy inputs to accumulators ---
-        TLOAD(oiNewTile, oiNewGlobal);
-        TLOAD(mijND, mijGlobalND);
-        TLOAD(lijND, lijGlobalND);
-        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-        // Passthrough to MTE3 (no V compute needed)
-        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        TSTORE(miGlobalND, mijND);    // mi = mij
-        TSTORE(liGlobalND, lijND);    // li = lij
-        TSTORE(oiGlobal, oiNewTile);  // oi = oi_new
-
-        if (is_last) {
-            // Single block: normalize dst = oi_new / lij
-            // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV
-            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-            TLOAD(liDN, liGlobalDN);
-            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-            TROWEXPANDDIV(oiNewTile, oiNewTile, liDN);
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            TSTORE(dstGlobal, oiNewTile);
-        }
-    } else {
-        // --- Subsequent blocks: accumulate ---
-
-        // Phase 1: Load all inputs
-        TLOAD(oiNewTile, oiNewGlobal);
-        TLOAD(oiTile, oiGlobal);
-        TLOAD(mijND, mijGlobalND);
-        TLOAD(lijND, lijGlobalND);
-        TLOAD(miND, miGlobalND);
-        TLOAD(liND, liGlobalND);
-        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-        // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols)
-        TMAX(miNewND, miND, mijND);    // mi_new = max(mi, mij)
-        TSUB(alphaND, miND, miNewND);  // alpha = mi - mi_new
-        TEXP(alphaND, alphaND);        // alpha = exp(mi - mi_new)
-        TSUB(betaND, mijND, miNewND);  // beta = mij - mi_new
-        TEXP(betaND, betaND);          // beta = exp(mij - mi_new)
-        TMUL(liND, alphaND, liND);     // li = alpha * li
-        TMUL(tmpND, betaND, lijND);    // tmp = beta * lij
-        TADD(liND, liND, tmpND);       // li = alpha * li + beta * lij (= li_new)
-
-        // Phase 3: Store scalar results to GM (ND format)
-        // mi_new -> mi accumulator, li_new -> li accumulator
-        // alpha -> mij buffer (reuse), beta -> lij buffer (reuse)
-        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        TSTORE(miGlobalND, miNewND);   // persist mi_new
-        TSTORE(liGlobalND, liND);      // persist li_new
-        TSTORE(mijGlobalND, alphaND);  // temp: alpha to mij buffer
-        TSTORE(lijGlobalND, betaND);   // temp: beta to lij buffer
-
-        // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN
-        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-        TLOAD(alphaDN, mijGlobalDN);  // alpha from mij buffer as DN
-        TLOAD(betaDN, lijGlobalDN);   // beta from lij buffer as DN
-        if (is_last) {
-            TLOAD(liDN, liGlobalDN);  // li_new from li buffer as DN
-        }
-        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-
-        // Phase 5: Scale data tiles using row-broadcast multiply
-        TROWEXPANDMUL(oiTile, oiTile, alphaDN);       // oi *= alpha
-        TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
-        TADD(oiTile, oiTile, oiNewTile);              // oi = alpha*oi + beta*oi_new
-
-        if (is_last) {
-            // Phase 6: Normalize and output
-            TROWEXPANDDIV(oiTile, oiTile, liDN);  // dst = oi / li_new
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            TSTORE(dstGlobal, oiTile);
-        } else {
-            // Phase 6: Store updated accumulators
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            TSTORE(oiGlobal, oiTile);
-        }
-    }
-
-    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-}
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ uint8_t *mij = reinterpret_cast<__gm__ uint8_t *>(args[0]);
-    __gm__ uint8_t *lij = reinterpret_cast<__gm__ uint8_t *>(args[1]);
-    __gm__ uint8_t *oi_new = reinterpret_cast<__gm__ uint8_t *>(args[2]);
-    __gm__ uint8_t *mi = reinterpret_cast<__gm__ uint8_t *>(args[3]);
-    __gm__ uint8_t *li = reinterpret_cast<__gm__ uint8_t *>(args[4]);
-    __gm__ uint8_t *oi = reinterpret_cast<__gm__ uint8_t *>(args[5]);
-    int is_first = static_cast<int>(args[6]);
-    int is_last = static_cast<int>(args[7]);
-    __gm__ uint8_t *dst = reinterpret_cast<__gm__ uint8_t *>(args[8]);
-
-    online_update_impl(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
-}
diff --git a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
deleted file mode 100644
index c07ca22a1..000000000
--- a/examples/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-// Softmax Preparation Kernel (AIV)
-//
-// Fixed tile size: sij is (16, 16)
-//
-// Computes:
-//   sij_scale = sij * scale
-//   mij = row_max(sij_scale)        -> (M, 1)
-//   pij = exp(sij_scale - mij)      -> (M, N)
-//   lij = row_sum(pij)              -> (M, 1)
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-static __aicore__ void softmax_prepare_impl(
-    __gm__ uint8_t *sij_raw, float scale_value, __gm__ uint8_t *pij_raw, __gm__ uint8_t *mij_raw,
-    __gm__ uint8_t *lij_raw
-) {
-    constexpr int M = 16, N = 16;
-
-    __gm__ float *sij = reinterpret_cast<__gm__ float *>(sij_raw);
-    __gm__ half *pij = reinterpret_cast<__gm__ half *>(pij_raw);
-    __gm__ float *mij = reinterpret_cast<__gm__ float *>(mij_raw);
-    __gm__ float *lij = reinterpret_cast<__gm__ float *>(lij_raw);
-
-    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
-
-    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
-    using GlobalDataMxN_f16 = GlobalTensor<half, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
-    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>;
-
-    GlobalDataMxN sijGlobal(sij);
-    GlobalDataMxN_f16 pijGlobal(pij);
-    GlobalScalarDN mijGlobal(mij);
-    GlobalScalarDN lijGlobal(lij);
-
-    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
-    using TileVecMxN_f16 = Tile<TileType::Vec, half, M, N, BLayout::RowMajor, M, N>;
-    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
-
-    TileVecMxN sijTile;
-    TileVecMxN pijTile;
-    TileVecMxN tmpTile;
-    TileScalarDN maxTile;
-    TileScalarDN sumTile;
-    TileVecMxN_f16 pijF16Tile;
-
-    TASSIGN(sijTile, 0x0);
-    TASSIGN(pijTile, M * N * sizeof(float));
-    TASSIGN(tmpTile, 2 * M * N * sizeof(float));
-    TASSIGN(maxTile, 3 * M * N * sizeof(float));
-    TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float));
-    TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
-
-    TLOAD(sijTile, sijGlobal);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-    TMULS(sijTile, sijTile, scale_value);
-    TROWMAX(maxTile, sijTile, tmpTile);
-    TROWEXPANDSUB(pijTile, sijTile, maxTile);
-    TEXP(pijTile, pijTile);
-    // Truncate pij to fp16 first, then compute lij from truncated values (matches golden)
-    TCVT(pijF16Tile, pijTile, RoundMode::CAST_ROUND);
-    TCVT(pijTile, pijF16Tile, RoundMode::CAST_ROUND);
-    TROWSUM(sumTile, pijTile, tmpTile);
-
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(mijGlobal, maxTile);
-    TSTORE(lijGlobal, sumTile);
-    TSTORE(pijGlobal, pijF16Tile);
-
-    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-}
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ uint8_t *sij = reinterpret_cast<__gm__ uint8_t *>(args[0]);
-    union {
-        uint64_t u;
-        float f;
-    } scale_conv;
-    scale_conv.u = static_cast<uint64_t>(args[1]);
-    float scale_value = scale_conv.f;
-    __gm__ uint8_t *pij = reinterpret_cast<__gm__ uint8_t *>(args[2]);
-    __gm__ uint8_t *mij = reinterpret_cast<__gm__ uint8_t *>(args[3]);
-    __gm__ uint8_t *lij = reinterpret_cast<__gm__ uint8_t *>(args[4]);
-
-    softmax_prepare_impl(sij, scale_value, pij, mij, lij);
-}
diff --git a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp
deleted file mode 100644
index 17dbd02ce..000000000
--- a/examples/a5/host_build_graph/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Paged Attention Orchestration - Small Scale (16x16)
- *
- * Supports small-scale paged attention with:
- *   Query: (batch, q_head_num, head_dim) fp16
- *   Key:   (total_blocks, block_size, kv_head_num, head_dim) fp16 (NOT transposed)
- *   Value: (total_blocks, block_size, kv_head_num, head_dim) fp16
- *   Output: (batch, q_head_num, head_dim) float32
- *
- * Head tiling: q_tile_size = min(num_heads, 128)
- * GQA: kv_head_num can differ from q_head_num
- *
- * ChipStorageTaskArgs layout: tensors=[query, key_cache, value_cache, block_table, context_lens, out], scalars=[scale]
- */
-
-#include <algorithm>
-#include <cstring>
-#include <iostream>
-
-#include "orchestration_api.h"  // NOLINT(build/include_subdir)
-
-#define FUNC_QK_MATMUL 0
-#define FUNC_SOFTMAX_PREPARE 1
-#define FUNC_PV_MATMUL 2
-#define FUNC_ONLINE_UPDATE 3
-
-extern "C" {
-
-int build_paged_attention_graph(OrchestrationRuntime *runtime, const ChipStorageTaskArgs &orch_args) {
-    if (orch_args.tensor_count() < 6) {
-        std::cerr << "Expected at least 6 tensors, got " << orch_args.tensor_count() << '\n';
-        return -1;
-    }
-
-    // Extract host pointers from tensor metadata
-    void *host_query = orch_args.tensor(0).data_as<void>();
-    void *host_key_cache = orch_args.tensor(1).data_as<void>();
-    void *host_value_cache = orch_args.tensor(2).data_as<void>();
-    int *host_block_table = orch_args.tensor(3).data_as<int>();
-    int *host_context_lens = orch_args.tensor(4).data_as<int>();
-    void *host_out = orch_args.tensor(5).data_as<void>();
-
-    // Extract sizes from tensor metadata
-    size_t query_size = orch_args.tensor(0).nbytes();
-    size_t key_cache_size = orch_args.tensor(1).nbytes();
-    size_t value_cache_size = orch_args.tensor(2).nbytes();
-    size_t out_size = orch_args.tensor(5).nbytes();
-
-    // Read dimensions from tensor shapes
-    // query: (batch, num_heads, head_dim)
-    uint32_t batch = orch_args.tensor(0).shapes[0];
-    uint32_t num_heads = orch_args.tensor(0).shapes[1];
-    uint32_t head_dim = orch_args.tensor(0).shapes[2];
-
-    // key_cache: (total_blocks, block_size, kv_head_num, head_dim)
-    uint32_t block_size = orch_args.tensor(1).shapes[1];
-    uint32_t kv_head_num = orch_args.tensor(1).shapes[2];
-
-    // block_table: (batch, max_num_blocks_per_req)
-    uint32_t max_num_blocks = orch_args.tensor(3).shapes[1];
-
-    // scale: first scalar argument
-    uint64_t scale_value_bits = orch_args.scalar(0);
-
-    uint32_t q_tile_size = std::min(num_heads, 128u);
-    uint32_t num_head_tiles = (num_heads + q_tile_size - 1) / q_tile_size;
-
-    std::cout << "\n=== build_paged_attention_graph ===" << '\n';
-    std::cout << "batch=" << batch << ", num_heads=" << num_heads << ", kv_head_num=" << kv_head_num
-              << ", head_dim=" << head_dim << '\n';
-    std::cout << "block_size=" << block_size << ", max_num_blocks=" << max_num_blocks << '\n';
-    std::cout << "q_tile_size=" << q_tile_size << ", num_head_tiles=" << num_head_tiles << '\n';
-
-    // Allocate device memory for inputs/outputs
-    void *dev_query = device_malloc(runtime, query_size);
-    void *dev_key_cache = device_malloc(runtime, key_cache_size);
-    void *dev_value_cache = device_malloc(runtime, value_cache_size);
-    void *dev_out = device_malloc(runtime, out_size);
-
-    if (!dev_query || !dev_key_cache || !dev_value_cache || !dev_out) {
-        std::cerr << "Error: Failed to allocate device memory\n";
-        return -1;
-    }
-
-    copy_to_device(runtime, dev_query, host_query, query_size);
-    copy_to_device(runtime, dev_key_cache, host_key_cache, key_cache_size);
-    copy_to_device(runtime, dev_value_cache, host_value_cache, value_cache_size);
-    record_tensor_pair(runtime, host_out, dev_out, out_size);
-
-    // Buffer sizes depend on q_tile_size and block_size
-    size_t sij_size = static_cast<size_t>(q_tile_size) * block_size * sizeof(float);
-    size_t pij_size = static_cast<size_t>(q_tile_size) * block_size * sizeof(uint16_t);
-    size_t mij_size = static_cast<size_t>(q_tile_size) * sizeof(float);
-    size_t lij_size = mij_size;
-    size_t oi_new_size = static_cast<size_t>(q_tile_size) * head_dim * sizeof(float);
-
-    // Per-batch-per-block intermediate buffers
-    uint32_t total_buffers = batch * max_num_blocks;
-    void **dev_sij_arr = new void *[total_buffers];
-    void **dev_pij_arr = new void *[total_buffers];
-    void **dev_mij_arr = new void *[total_buffers];
-    void **dev_lij_arr = new void *[total_buffers];
-    void **dev_oi_new_arr = new void *[total_buffers];
-
-    for (uint32_t i = 0; i < total_buffers; i++) {
-        dev_sij_arr[i] = device_malloc(runtime, sij_size);
-        dev_pij_arr[i] = device_malloc(runtime, pij_size);
-        dev_mij_arr[i] = device_malloc(runtime, mij_size);
-        dev_lij_arr[i] = device_malloc(runtime, lij_size);
-        dev_oi_new_arr[i] = device_malloc(runtime, oi_new_size);
-    }
-
-    // Per-(batch, head_tile) accumulators
-    uint32_t total_accums = batch * num_head_tiles;
-    size_t mi_size = static_cast<size_t>(q_tile_size) * sizeof(float);
-    size_t li_size = mi_size;
-    size_t oi_size = static_cast<size_t>(q_tile_size) * head_dim * sizeof(float);
-
-    void **dev_mi_arr = new void *[total_accums];
-    void **dev_li_arr = new void *[total_accums];
-    void **dev_oi_arr = new void *[total_accums];
-
-    for (uint32_t i = 0; i < total_accums; i++) {
-        dev_mi_arr[i] = device_malloc(runtime, mi_size);
-        dev_li_arr[i] = device_malloc(runtime, li_size);
-        dev_oi_arr[i] = device_malloc(runtime, oi_size);
-    }
-
-    std::cout << "Allocated " << total_buffers << " per-block buffers\n";
-    std::cout << "Allocated " << total_accums << " per-(batch,head_tile) accumulators\n";
-
-    int total_tasks = 0;
-
-    for (uint32_t b_idx = 0; b_idx < batch; b_idx++) {
-        int cur_seq = host_context_lens[b_idx];
-        uint32_t bn_this_batch = (static_cast<uint32_t>(cur_seq) + block_size - 1) / block_size;
-
-        for (uint32_t ht = 0; ht < num_head_tiles; ht++) {
-            uint32_t cur_offset = ht * q_tile_size;
-
-            // Query: (batch, q_head_num, head_dim) fp16
-            // qi points to heads [cur_offset .. cur_offset+q_tile_size) for batch b_idx
-            uint8_t *qi_ptr = reinterpret_cast<uint8_t *>(dev_query) +
-                              static_cast<int64_t>(b_idx * num_heads + cur_offset) * head_dim * sizeof(uint16_t);
-
-            // Output: (batch * q_head_num, head_dim) float32
-            uint8_t *out_ptr = reinterpret_cast<uint8_t *>(dev_out) +
-                               static_cast<int64_t>(b_idx * num_heads + cur_offset) * head_dim * sizeof(float);
-
-            // GQA: which kv_head this head tile maps to
-            uint32_t kv_head_idx = cur_offset / (num_heads / kv_head_num);
-
-            // Per-(batch, head_tile) accumulators
-            uint32_t accum_idx = b_idx * num_head_tiles + ht;
-            void *dev_mi = dev_mi_arr[accum_idx];
-            void *dev_li = dev_li_arr[accum_idx];
-            void *dev_oi = dev_oi_arr[accum_idx];
-
-            int t_up_prev = -1;
-
-            for (uint32_t bn = 0; bn < bn_this_batch; bn++) {
-                int cur_block_idx = host_block_table[b_idx * max_num_blocks + bn];
-
-                // Key: (total_blocks, block_size, kv_head_num, head_dim) fp16
-                uint8_t *kj_ptr = reinterpret_cast<uint8_t *>(dev_key_cache) +
-                                  (static_cast<int64_t>(cur_block_idx) * block_size * kv_head_num + kv_head_idx) *
-                                      head_dim * sizeof(uint16_t);
-
-                // Value: (total_blocks, block_size, kv_head_num, head_dim) fp16
-                uint8_t *vj_ptr = reinterpret_cast<uint8_t *>(dev_value_cache) +
-                                  (static_cast<int64_t>(cur_block_idx) * block_size * kv_head_num + kv_head_idx) *
-                                      head_dim * sizeof(uint16_t);
-
-                uint32_t buf_idx = b_idx * max_num_blocks + bn;
-                void *dev_sij = dev_sij_arr[buf_idx];
-                void *dev_pij = dev_pij_arr[buf_idx];
-                void *dev_mij = dev_mij_arr[buf_idx];
-                void *dev_lij = dev_lij_arr[buf_idx];
-                void *dev_oi_new = dev_oi_new_arr[buf_idx];
-
-                // QK: qi(M, K) @ kj.T(K, N) -> sij(M, N)
-                uint64_t qk_args[6] = {reinterpret_cast<uint64_t>(qi_ptr),  reinterpret_cast<uint64_t>(kj_ptr),
-                                       reinterpret_cast<uint64_t>(dev_sij), static_cast<uint64_t>(q_tile_size),
-                                       static_cast<uint64_t>(head_dim),     static_cast<uint64_t>(block_size)};
-                int t_qk = add_task(runtime, qk_args, 6, FUNC_QK_MATMUL, CoreType::AIC);
-                total_tasks++;
-
-                // SF: scale, rowmax, exp, rowsum -> pij, mij, lij
-                uint64_t sf_args[7] = {reinterpret_cast<uint64_t>(dev_sij), scale_value_bits,
-                                       reinterpret_cast<uint64_t>(dev_pij), reinterpret_cast<uint64_t>(dev_mij),
-                                       reinterpret_cast<uint64_t>(dev_lij), static_cast<uint64_t>(q_tile_size),
-                                       static_cast<uint64_t>(block_size)};
-                int t_sf = add_task(runtime, sf_args, 7, FUNC_SOFTMAX_PREPARE, CoreType::AIV);
-                total_tasks++;
-
-                // PV: pij(M, K') @ vj(K', N') -> oi_new(M, N')
-                uint64_t pv_args[6] = {reinterpret_cast<uint64_t>(dev_pij),    reinterpret_cast<uint64_t>(vj_ptr),
-                                       reinterpret_cast<uint64_t>(dev_oi_new), static_cast<uint64_t>(q_tile_size),
-                                       static_cast<uint64_t>(block_size),      static_cast<uint64_t>(head_dim)};
-                int t_pv = add_task(runtime, pv_args, 6, FUNC_PV_MATMUL, CoreType::AIC);
-                total_tasks++;
-
-                add_successor(runtime, t_qk, t_sf);
-                add_successor(runtime, t_sf, t_pv);
-
-                // Online Update: serialized across blocks (each depends on previous)
-                int is_first = (bn == 0) ? 1 : 0;
-                int is_last = (bn == bn_this_batch - 1) ? 1 : 0;
-
-                uint64_t up_args[11] = {reinterpret_cast<uint64_t>(dev_mij),    reinterpret_cast<uint64_t>(dev_lij),
-                                        reinterpret_cast<uint64_t>(dev_oi_new), reinterpret_cast<uint64_t>(dev_mi),
-                                        reinterpret_cast<uint64_t>(dev_li),     reinterpret_cast<uint64_t>(dev_oi),
-                                        static_cast<uint64_t>(is_first),        static_cast<uint64_t>(is_last),
-                                        reinterpret_cast<uint64_t>(out_ptr),    static_cast<uint64_t>(q_tile_size),
-                                        static_cast<uint64_t>(head_dim)};
-                int t_up = add_task(runtime, up_args, 11, FUNC_ONLINE_UPDATE, CoreType::AIV);
-                total_tasks++;
-
-                add_successor(runtime, t_pv, t_up);
-                if (t_up_prev >= 0) {
-                    add_successor(runtime, t_up_prev, t_up);
-                }
-                t_up_prev = t_up;
-            }
-        }
-    }
-
-    delete[] dev_sij_arr;
-    delete[] dev_pij_arr;
-    delete[] dev_mij_arr;
-    delete[] dev_lij_arr;
-    delete[] dev_oi_new_arr;
-    delete[] dev_mi_arr;
-    delete[] dev_li_arr;
-    delete[] dev_oi_arr;
-
-    std::cout << "Created " << total_tasks << " tasks\n";
-    print_runtime(runtime);
-
-    return 0;
-}
-}
diff --git a/examples/a5/host_build_graph/paged_attention/test_paged_attention.py b/examples/a5/host_build_graph/paged_attention/test_paged_attention.py
deleted file mode 100644
index 7d72b6be1..000000000
--- a/examples/a5/host_build_graph/paged_attention/test_paged_attention.py
+++ /dev/null
@@ -1,118 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Paged attention — host_build_graph example (small scale, float16).
-
-AIC+AIV mixed execution with online softmax paged attention.
-Small-scale cases for quick validation on A5.
-"""
-
-import torch
-from simpler.task_interface import ArgDirection as D
-
-from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
-from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
-from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
-
-
-@scene_test(level=2, runtime="host_build_graph")
-class TestPagedAttention(SceneTestCase):
-    """Paged attention with host_build_graph runtime on A5."""
-
-    RTOL = 1e-2
-    ATOL = 1e-2
-
-    CALLABLE = {
-        "orchestration": {
-            "source": "kernels/orchestration/paged_attention_orch.cpp",
-            "function_name": "build_paged_attention_graph",
-            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-        },
-        "incores": [
-            {
-                "func_id": 0,
-                "source": "kernels/aic/aic_qk_matmul.cpp",
-                "core_type": "aic",
-                "signature": [D.IN, D.IN, D.OUT],
-            },
-            {
-                "func_id": 2,
-                "source": "kernels/aic/aic_pv_matmul.cpp",
-                "core_type": "aic",
-                "signature": [D.IN, D.IN, D.OUT],
-            },
-            {
-                "func_id": 1,
-                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
-                "core_type": "aiv",
-                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-            },
-            {
-                "func_id": 3,
-                "source": "kernels/aiv/aiv_online_update.cpp",
-                "core_type": "aiv",
-                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-            },
-        ],
-    }
-
-    CASES = [
-        {
-            "name": "Case1",
-            "platforms": ["a5sim", "a5"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 3},
-            "params": {
-                "batch": 1,
-                "num_heads": 16,
-                "kv_head_num": 1,
-                "head_dim": 16,
-                "block_size": 16,
-                "context_len": 16,
-                "max_model_len": 256,
-                "dtype": "float16",
-            },
-        },
-        {
-            "name": "Case2",
-            "platforms": ["a5sim", "a5"],
-            "config": {"aicpu_thread_num": 3, "block_dim": 3},
-            "manual": True,
-            "params": {
-                "batch": 1,
-                "num_heads": 16,
-                "kv_head_num": 1,
-                "head_dim": 16,
-                "block_size": 16,
-                "context_len": 64,
-                "max_model_len": 256,
-                "dtype": "float16",
-            },
-        },
-    ]
-
-    def generate_args(self, params):
-        inputs = _pa_generate_inputs(params)
-        specs = []
-        for name, val in inputs:
-            if isinstance(val, torch.Tensor):
-                specs.append(Tensor(name, val))
-            else:
-                specs.append(Scalar(name, val))
-        return TaskArgsBuilder(*specs)
-
-    def compute_golden(self, args, params):
-        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
-        _pa_compute_golden(tensors, params)
-        for s in args.specs:
-            if isinstance(s, Tensor) and s.name in tensors:
-                getattr(args, s.name)[:] = tensors[s.name]
-
-
-if __name__ == "__main__":
-    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp
index c2800abcb..c6e04d559 100644
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp
@@ -10,9 +10,11 @@
  */
 // PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N)
 //
-// Fixed tile size: (16, 16) @ (16, 16) -> (16, 16)
+// Runtime dispatch selects among three tile configurations: a small (16, 16) @ (16, 16) tile, plus:
+//   Case1: (16, 128) @ (128, 128) -> (16, 128)
+//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)
 //
-// pij is float16 (converted from fp32 in softmax_prepare via TCVT).
+// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT).
 // vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout.
 // Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB.
 
@@ -33,13 +35,13 @@ using namespace pto;
 
 template <int M, int K, int N>
 static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) {
-    __gm__ half *pij_addr = reinterpret_cast<__gm__ half *>(pij->buffer.addr);
-    __gm__ half *vj_addr = reinterpret_cast<__gm__ half *>(vj->buffer.addr);
+    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
+    __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr);
     __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
 
-    // pij (M, K) fp16, vj (K, N) fp16 in ND (row-major), oi_new (M, N) fp32
-    using GlobalA = GlobalTensor<half, Shape<1, 1, 1, M, K>, pto::Stride<M * K, M * K, M * K, K, 1>>;
-    using GlobalB = GlobalTensor<half, Shape<1, 1, 1, K, N>, pto::Stride<K * N, K * N, K * N, N, 1>>;
+    // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, pto::Stride<M * K, M * K, M * K, K, 1>>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, pto::Stride<K * N, K * N, K * N, N, 1>>;
     using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<M * N, M * N, M * N, N, 1>>;
 
     GlobalA pijGlobal(pij_addr + pij->start_offset);
@@ -47,12 +49,12 @@ static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __g
     GlobalOut oiGlobal(oi_addr + oi->start_offset);
 
     // L1 Mat tiles: standard ND pattern for both A and B
-    using TileMatA = Tile<TileType::Mat, half, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, half, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
 
     // L0 tiles
-    using LeftTile = TileLeft<half, M, K, M, K>;
-    using RightTile = TileRight<half, K, N, K, N>;
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
     using AccTile = TileAcc<float, M, N, M, N>;
 
     TileMatA aMatTile;
@@ -67,15 +69,17 @@ static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __g
     TASSIGN(bTile, 0x0);
     TASSIGN(cTile, 0x0);
 
-    // Load pij and vj to L1
+    // Load pij and vj to L1 with separate events for pipeline overlap
     TLOAD(aMatTile, pijGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
     TLOAD(bMatTile, vjGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
 
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    // Move A to L0A as soon as A load completes (B may still be loading)
     wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-    // Move to L0A/L0B
     TMOV(aTile, aMatTile);
+    // Move B to L0B after B load completes
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
     TMOV(bTile, bMatTile);
 
     set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
@@ -97,6 +101,14 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]);
     __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]);
     __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
-
-    pv_matmul_impl<16, 16, 16>(pij, vj, oi_new);
+    uint64_t q_tile_size = static_cast<uint64_t>(pij->shapes[0]);
+    // Dispatch keys off the pij tensor shape; the scalar slots args[4]/args[5] are not read below.
+
+    if (q_tile_size == 16 && pij->shapes[1] <= 16) {
+        pv_matmul_impl<16, 16, 16>(pij, vj, oi_new);
+    } else if (q_tile_size == 16) {
+        pv_matmul_impl<16, 128, 128>(pij, vj, oi_new);
+    } else {
+        pv_matmul_impl<64, 64, 128>(pij, vj, oi_new);
+    }
 }
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp
index cb1de3e1e..c3e38f7d2 100644
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp
@@ -10,7 +10,9 @@
  */
 // QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N)
 //
-// Fixed tile size: (16, 16) @ (16, 16).T -> (16, 16)
+// Runtime dispatch selects among three tile configurations: a small (16, 16) @ (16, 16).T tile, plus:
+//   Case1: (16, 128) @ (128, 128).T -> (16, 128)
+//   Case2: (64, 128) @ (128,  64).T -> (64,  64)
 //
 // kj is stored as (N, K) = (block_size, head_dim) in row-major memory.
 // This is equivalent to (K, N) in column-major (DN) layout.
@@ -33,14 +35,14 @@ using namespace pto;
 
 template <int M, int K, int N>
 static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) {
-    __gm__ half *qi_addr = reinterpret_cast<__gm__ half *>(qi->buffer.addr);
-    __gm__ half *kj_addr = reinterpret_cast<__gm__ half *>(kj->buffer.addr);
+    __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr);
+    __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr);
     __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
 
-    // qi (M, K) fp16 in ND (row-major) layout
-    using GlobalA = GlobalTensor<half, Shape<1, 1, 1, M, K>, pto::Stride<M * K, M * K, M * K, K, 1>>;
+    // qi (M, K) bf16 in ND (row-major) layout
+    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, pto::Stride<M * K, M * K, M * K, K, 1>>;
     // kj stored as (N, K) row-major = (K, N) column-major -> DN layout
-    using GlobalB = GlobalTensor<half, Shape<1, 1, 1, K, N>, pto::Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
+    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, pto::Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
     using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<M * N, M * N, M * N, N, 1>>;
 
     GlobalA qiGlobal(qi_addr + qi->start_offset);
@@ -48,12 +50,12 @@ static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm
     GlobalOut sijGlobal(sij_addr + sij->start_offset);
 
     // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor)
-    using TileMatA = Tile<TileType::Mat, half, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, half, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
+    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
+    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
 
     // L0 tiles
-    using LeftTile = TileLeft<half, M, K, M, K>;
-    using RightTile = TileRight<half, K, N, K, N>;
+    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
+    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
     using AccTile = TileAcc<float, M, N, M, N>;
 
     TileMatA aMatTile;
@@ -68,15 +70,17 @@ static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm
     TASSIGN(bTile, 0x0);
     TASSIGN(cTile, 0x0);
 
-    // Load A and B to L1
+    // Load A and B to L1 with separate events for pipeline overlap
     TLOAD(aMatTile, qiGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
     TLOAD(bMatTile, kjGlobal);
+    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
 
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
+    // Move A to L0A as soon as A load completes (B may still be loading)
     wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-    // Move from L1 to L0A/L0B
     TMOV(aTile, aMatTile);
+    // Move B to L0B after B load completes
+    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
     TMOV(bTile, bMatTile);
 
     set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
@@ -98,6 +102,14 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]);
     __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]);
     __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]);
-
-    qk_matmul_impl<16, 16, 16>(qi, kj, sij);
+    uint64_t q_tile_size = static_cast<uint64_t>(qi->shapes[0]);
+    // Dispatch keys off the qi tensor shape; the scalar slots args[4]/args[5] are not read below.
+
+    if (q_tile_size == 16 && qi->shapes[1] <= 16) {
+        qk_matmul_impl<16, 16, 16>(qi, kj, sij);
+    } else if (q_tile_size == 16) {
+        qk_matmul_impl<16, 128, 128>(qi, kj, sij);
+    } else {
+        qk_matmul_impl<64, 128, 64>(qi, kj, sij);
+    }
 }
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp
index d0b09a69b..cb841572c 100644
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp
@@ -10,13 +10,15 @@
  */
 // Online Softmax Update + Normalize Kernel (AIV)
 //
-// Fixed tile size: oi/oi_new are (16, 16), mij/lij/mi/li are 16-element vectors
+// Operates on full tiles where M=q_tile_size, N=head_dim (128):
+//   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
+//   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
 //
-// Scalar layout strategy:
-//   M scalar floats stored contiguously in GM can be loaded as either:
-//   - ND (kScalarRows, kScalarCols) RowMajor for element-wise ops (TMAX, TSUB, TEXP, TMUL, TADD)
-//   - DN (kAlignedRows, 1) ColMajor for row-broadcast ops (TROWEXPANDMUL, TROWEXPANDDIV)
-//   Conversion between layouts uses GM round-trip: ND TSTORE -> DN TLOAD.
+// Scalar layout strategy using TRESHAPE (zero-copy UB reshape):
+//   Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV.
+//   For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M).
+//   After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops.
+//   This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original.
 
 #include <cstdint>
 #include <pto/pto-inst.hpp>
@@ -46,11 +48,6 @@ static __aicore__ void online_update_impl(
     __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
     __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr);
 
-    // Scalar tile dimensions for RowMajor layout:
-    // kScalarCols = 32 bytes / 4 bytes per float = 8 floats per row (one 32-byte block)
-    // kScalarRows = M / 8 (M=16 -> 2 rows)
-    constexpr int kScalarCols = 32 / sizeof(float);
-    constexpr int kScalarRows = M / kScalarCols;
     // Aligned rows for ColMajor DN tiles (32-byte alignment)
     constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
 
@@ -59,77 +56,84 @@ static __aicore__ void online_update_impl(
     // Data (M, N) RowMajor
     using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
 
-    // Scalar ND: M contiguous floats as (kScalarRows, kScalarCols) RowMajor
+    // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading
+    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>;
+
+    // Scalar ND: for storing mi_new and li_new back to GM
+    constexpr int kScalarCols = 32 / sizeof(float);
+    constexpr int kScalarRows = M / kScalarCols;
     using GlobalScalarND =
         GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, pto::Stride<1, 1, 1, kScalarCols, 1>>;
 
-    // Scalar DN: same M contiguous floats as (kAlignedRows, 1) ColMajor
-    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>;
-
     // --- GlobalTensor instances ---
 
     GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset);
     GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset);
     GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset);
 
-    // ND globals for scalar element-wise operations
-    GlobalScalarND mijGlobalND(mij_ptr + mij->start_offset);
-    GlobalScalarND lijGlobalND(lij_ptr + lij->start_offset);
-    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
-    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
-
-    // DN globals aliased to same GM for ColMajor reload (used after ND TSTORE)
+    // DN globals for loading scalars as ColMajor
     GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset);
     GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset);
+    GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset);
     GlobalScalarDN liGlobalDN(li_ptr + li->start_offset);
 
+    // ND globals for storing scalar results
+    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
+    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
+
     // --- Tile types ---
 
     using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
+    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
+
+    // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE
+    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
+
+    // ND tile for storing back to GM
     using TileScalarND =
         Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
-    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
 
     // --- UB memory layout ---
 
     constexpr int kDataBytes = M * N * sizeof(float);
-    constexpr int kScalarNDBytes = kScalarRows * kScalarCols * sizeof(float);
     constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
 
     // Data tiles
     TileDataMxN oiNewTile;
     TileDataMxN oiTile;
 
-    // Scalar ND tiles for element-wise arithmetic
-    TileScalarND mijND, lijND, miND, liND;
-    TileScalarND miNewND, alphaND, betaND, tmpND;
+    // Scalar DN tiles loaded from GM (ColMajor)
+    TileScalarDN mijDN, lijDN, miDN, liDN;
 
-    // Scalar DN tiles for TROWEXPAND operations
-    TileScalarDN alphaDN, betaDN, liDN;
+    // Temporary DN tiles for results
+    TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN;
 
     TASSIGN(oiNewTile, 0);
     TASSIGN(oiTile, kDataBytes);
-    TASSIGN(mijND, 2 * kDataBytes);
-    TASSIGN(lijND, 2 * kDataBytes + kScalarNDBytes);
-    TASSIGN(miND, 2 * kDataBytes + 2 * kScalarNDBytes);
-    TASSIGN(liND, 2 * kDataBytes + 3 * kScalarNDBytes);
-    TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarNDBytes);
-    TASSIGN(alphaND, 2 * kDataBytes + 5 * kScalarNDBytes);
-    TASSIGN(betaND, 2 * kDataBytes + 6 * kScalarNDBytes);
-    TASSIGN(tmpND, 2 * kDataBytes + 7 * kScalarNDBytes);
-    TASSIGN(alphaDN, 2 * kDataBytes + 8 * kScalarNDBytes);
-    TASSIGN(betaDN, 2 * kDataBytes + 8 * kScalarNDBytes + kScalarDNBytes);
-    TASSIGN(liDN, 2 * kDataBytes + 8 * kScalarNDBytes + 2 * kScalarDNBytes);
+    TASSIGN(mijDN, 2 * kDataBytes);
+    TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes);
+    TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes);
+    TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes);
+    TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes);
+    TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes);
+    TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes);
+    TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes);
+    TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes);
 
     if (is_first) {
         // --- First block: copy inputs to accumulators ---
         TLOAD(oiNewTile, oiNewGlobal);
-        TLOAD(mijND, mijGlobalND);
-        TLOAD(lijND, lijGlobalND);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
         set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
         wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
 
-        // Passthrough to MTE3 (no V compute needed)
+        // Store mi = mij, li = lij, oi = oi_new
+        // Alias ND tiles to the same UB as DN tiles for storing as ND format
+        TileScalarND mijND, lijND;
+        TASSIGN(mijND, 2 * kDataBytes);                   // alias same UB as mijDN
+        TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes);  // alias same UB as lijDN
+
         set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
         wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
         TSTORE(miGlobalND, mijND);    // mi = mij
@@ -138,13 +142,10 @@ static __aicore__ void online_update_impl(
 
         if (is_last) {
             // Single block: normalize dst = oi_new / lij
-            // lij stored to li buffer in ND format; reload as DN for TROWEXPANDDIV
-            set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-            wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-            TLOAD(liDN, liGlobalDN);
-            set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-            wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-            TROWEXPANDDIV(oiNewTile, oiNewTile, liDN);
+            // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV
+            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
+            TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN);
             set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
             wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
             TSTORE(dstGlobal, oiNewTile);
@@ -152,64 +153,70 @@ static __aicore__ void online_update_impl(
     } else {
         // --- Subsequent blocks: accumulate ---
 
-        // Phase 1: Load all inputs
+        // Load all inputs
         TLOAD(oiNewTile, oiNewGlobal);
         TLOAD(oiTile, oiGlobal);
-        TLOAD(mijND, mijGlobalND);
-        TLOAD(lijND, lijGlobalND);
-        TLOAD(miND, miGlobalND);
-        TLOAD(liND, liGlobalND);
+        TLOAD(mijDN, mijGlobalDN);
+        TLOAD(lijDN, lijGlobalDN);
+        TLOAD(miDN, miGlobalDN);
+        TLOAD(liDN, liGlobalDN);
         set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
         wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
 
-        // Phase 2: Scalar arithmetic in RowMajor (kScalarRows, kScalarCols)
-        // pipe_barrier(PIPE_V) required between each dependent vector operation
-        // to resolve RAW hazards on shared UB tiles.
-        TMAX(miNewND, miND, mijND);    // mi_new = max(mi, mij)
-        TSUB(alphaND, miND, miNewND);  // alpha = mi - mi_new
-        TEXP(alphaND, alphaND);        // alpha = exp(mi - mi_new)
-        TSUB(betaND, mijND, miNewND);  // beta = mij - mi_new
-        TEXP(betaND, betaND);          // beta = exp(mij - mi_new)
-        TMUL(liND, alphaND, liND);     // li = alpha * li
-        TMUL(tmpND, betaND, lijND);    // tmp = beta * lij
-        TADD(liND, liND, tmpND);       // li = alpha * li + beta * lij (= li_new)
-
-        // Phase 3: Store scalar results to GM (ND format)
-        // mi_new -> mi accumulator, li_new -> li accumulator
-        // alpha -> mij buffer (reuse), beta -> lij buffer (reuse)
-        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        TSTORE(miGlobalND, miNewND);   // persist mi_new
-        TSTORE(liGlobalND, liND);      // persist li_new
-        TSTORE(mijGlobalND, alphaND);  // temp: alpha to mij buffer
-        TSTORE(lijGlobalND, betaND);   // temp: beta to lij buffer
-
-        // Phase 4: Reload alpha, beta (and li if last) as ColMajor DN
-        set_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-        wait_flag(PIPE_MTE3, PIPE_MTE2, EVENT_ID0);
-        TLOAD(alphaDN, mijGlobalDN);  // alpha from mij buffer as DN
-        TLOAD(betaDN, lijGlobalDN);   // beta from lij buffer as DN
-        if (is_last) {
-            TLOAD(liDN, liGlobalDN);  // li_new from li buffer as DN
-        }
-        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID1);
-
-        // Phase 5: Scale data tiles using row-broadcast multiply
+        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic
+        TileScalarRow miRow, mijRow, liRow, lijRow;
+        TRESHAPE(miRow, miDN);
+        TRESHAPE(mijRow, mijDN);
+        TRESHAPE(liRow, liDN);
+        TRESHAPE(lijRow, lijDN);
+
+        // Scalar arithmetic in RowMajor (1, M) layout
+        TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow;
+        TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes);
+        TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes);
+        TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes);
+        TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes);
+
+        TMAX(miNewRow, miRow, mijRow);     // mi_new = max(mi, mij)
+        TSUB(alphaRow, miRow, miNewRow);   // alpha_exp = mi - mi_new
+        TEXP(alphaRow, alphaRow);          // alpha = exp(mi - mi_new)
+        TSUB(betaRow, mijRow, miNewRow);   // beta_exp = mij - mi_new
+        TEXP(betaRow, betaRow);            // beta = exp(mij - mi_new)
+        TMUL(tmpRow, alphaRow, liRow);     // alpha * li
+        TMUL(liNewRow, betaRow, lijRow);   // beta * lij
+        TADD(liNewRow, tmpRow, liNewRow);  // li_new = alpha*li + beta*lij
+
+        // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL
+        TRESHAPE(alphaDN, alphaRow);
+        TRESHAPE(betaDN, betaRow);
+
+        // Scale data tiles using row-broadcast multiply
         TROWEXPANDMUL(oiTile, oiTile, alphaDN);       // oi *= alpha
         TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
         TADD(oiTile, oiTile, oiNewTile);              // oi = alpha*oi + beta*oi_new
 
+        // Store mi_new and li_new to GM (ND format)
+        // Alias ND tiles to the same UB locations as miNewRow and liNewRow
+        TileScalarND miNewND, liNewND;
+        TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes);
+        TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes);
+
         if (is_last) {
-            // Phase 6: Normalize and output
-            TROWEXPANDDIV(oiTile, oiTile, liDN);  // dst = oi / li_new
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            // Normalize and output: dst = oi / li_new
+            TRESHAPE(liNewDN, liNewRow);
+            TROWEXPANDDIV(oiTile, oiTile, liNewDN);
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
             TSTORE(dstGlobal, oiTile);
         } else {
-            // Phase 6: Store updated accumulators
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
+            // Store updated accumulators
+            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+            TSTORE(miGlobalND, miNewND);  // persist mi_new
+            TSTORE(liGlobalND, liNewND);  // persist li_new
             TSTORE(oiGlobal, oiTile);
         }
     }
@@ -228,6 +235,14 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]);
     uint64_t is_first = static_cast<uint64_t>(args[7]);
     uint64_t is_last = static_cast<uint64_t>(args[8]);
+    uint64_t q_tile_size = static_cast<uint64_t>(mij->shapes[0]);
+    // head_dim is not read from args; the dispatch below uses oi_new->shapes[1]
 
-    online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    if (q_tile_size == 16 && oi_new->shapes[1] <= 16) {
+        online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else if (q_tile_size == 16) {
+        online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else {
+        online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    }
 }
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
index 7729bbbd8..4bb21f68b 100644
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
@@ -10,14 +10,16 @@
  */
 // Softmax Preparation Kernel (AIV) with partial block masking
 //
-// Fixed tile size: sij is (16, 16)
+// Operates on (M, N) tile where M=q_tile_size, N=block_size:
+//   Case1: sij is (16, 128)
+//   Case2: sij is (64, 64)
 //
 // For partial blocks (valid_len < N), positions [valid_len, N) in sij are
-// filled with -inf before softmax, ensuring exp(-inf)=0 so that invalid
-// key positions contribute zero attention weight.
+// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0
+// so that invalid key positions contribute zero attention weight.
 //
 // Computes:
-//   sij_masked = pad(sij, valid_len, -inf)
+//   sij_masked = TFILLPAD(sij, valid_len, pad=-inf)
 //   sij_scale = sij_masked * scale
 //   mij = row_max(sij_scale)        -> (M, 1)
 //   pij = exp(sij_scale - mij)      -> (M, N)
@@ -26,9 +28,8 @@
 #include <cstdint>
 #include <pto/pto-inst.hpp>
 
-#include "tensor.h"  // NOLINT(build/include_subdir)
+#include "tensor.h"
 
-// NOLINTNEXTLINE(build/namespaces)
 using namespace pto;
 
 #ifndef __gm__
@@ -36,7 +37,7 @@ using namespace pto;
 #endif
 
 #ifndef __aicore__
-#define __aicore__ [aicore]  // NOLINT(whitespace/braces)
+#define __aicore__ [aicore]
 #endif
 
 template <int M, int N>
@@ -45,18 +46,18 @@ static __aicore__ void softmax_prepare_impl(
 ) {
     uint64_t valid_len = static_cast<uint64_t>(sij->shapes[1]);
     __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
-    __gm__ half *pij_addr = reinterpret_cast<__gm__ half *>(pij->buffer.addr);
+    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
     __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
     __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
 
     constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
 
     using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
-    using GlobalDataMxN_f16 = GlobalTensor<half, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
+    using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
     using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>;
 
     GlobalDataMxN sijGlobal(sij_addr + sij->start_offset);
-    GlobalDataMxN_f16 pijGlobal(pij_addr + pij->start_offset);
+    GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset);
     GlobalScalarDN mijGlobal(mij_addr + mij->start_offset);
     GlobalScalarDN lijGlobal(lij_addr + lij->start_offset);
 
@@ -66,7 +67,7 @@ static __aicore__ void softmax_prepare_impl(
     using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
 
     using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
-    using TileVecMxN_f16 = Tile<TileType::Vec, half, M, N, BLayout::RowMajor, M, N>;
+    using TileVecMxN_bf16 = Tile<TileType::Vec, bfloat16_t, M, N, BLayout::RowMajor, M, N>;
     using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
 
     TileVecMxN sijTile;
@@ -76,8 +77,9 @@ static __aicore__ void softmax_prepare_impl(
     TileVecMxN tmpTile;
     TileScalarDN maxTile;
     TileScalarDN sumTile;
-    TileVecMxN_f16 pijF16Tile;
+    TileVecMxN_bf16 pijBf16Tile;
 
+    // All sij tiles share UB address 0x0 (in-place masking)
     TASSIGN(sijTile, 0x0);
     TASSIGN(sijDynTile, 0x0);
     TASSIGN(sijPadTile, 0x0);
@@ -85,28 +87,36 @@ static __aicore__ void softmax_prepare_impl(
     TASSIGN(tmpTile, 2 * M * N * sizeof(float));
     TASSIGN(maxTile, 3 * M * N * sizeof(float));
     TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float));
-    TASSIGN(pijF16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
+    TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
 
     // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks
     TLOAD(sijTile, sijGlobal);
     set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
     wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
 
-    // manually fill invalid columns with -inf as a workaround.
+    // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary,
+    // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N.
     TFILLPAD_INPLACE(sijPadTile, sijDynTile);
 
     TMULS(sijTile, sijTile, scale_value);
     TROWMAX(maxTile, sijTile, tmpTile);
     TROWEXPANDSUB(pijTile, sijTile, maxTile);
     TEXP(pijTile, pijTile);
-    // Truncate pij to fp16 first, then compute lij from truncated values (matches golden)
-    TCVT(pijF16Tile, pijTile, RoundMode::CAST_ROUND);
-    TCVT(pijTile, pijF16Tile, RoundMode::CAST_ROUND);
+    // Truncate pij to bf16 first; lij below is summed from the truncated values
+    TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);  // pij bf16 ready, can store early
+
+    // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel
+    TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
     TROWSUM(sumTile, pijTile, tmpTile);
+    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);  // sum ready
 
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    // Store pij exactly once, from the bf16 tile (overlaps with TCVT + TROWSUM above)
     wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
+    TSTORE(pijGlobal, pijBf16Tile);
+
+    // Store max and sum
     TSTORE(mijGlobal, maxTile);
+    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
     TSTORE(lijGlobal, sumTile);
-    TSTORE(pijGlobal, pijF16Tile);
 
@@ -119,7 +131,19 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]);
     __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]);
     __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]);
-    float scale_value = from_u64<float>(static_cast<uint64_t>(args[4]));
-
-    softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij);
+    union {
+        uint64_t u;
+        float f;
+    } scale_conv;
+    scale_conv.u = static_cast<uint64_t>(args[4]);
+    float scale_value = scale_conv.f;
+    uint64_t q_tile_size = static_cast<uint64_t>(sij->shapes[0]);
+
+    if (q_tile_size == 16 && pij->shapes[1] <= 16) {
+        softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij);
+    } else if (q_tile_size == 16) {
+        softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij);
+    } else {
+        softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij);
+    }
 }
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
index 5a528eb49..b3314019a 100644
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -15,18 +15,15 @@
  * Each block processes a single 16x16 matmul operation.
  *
  * Memory Layout:
- *   Query: (batch, 16, 16) - one 16x16 tile per batch fp16
- *   Key:   (total_blocks, 16, 16) - stored as K^T for direct matmul fp16
- *   Value: (total_blocks, 16, 16) - direct format fp16
- *
- * This file compiles as a standalone .so with zero runtime link dependencies.
- * All runtime calls go through the PTO2RuntimeOps function-pointer table.
+ *   Query: (batch, 16, 16) - one 16x16 tile per batch
+ *   Key:   (total_blocks, 16, 16) - stored as K^T for direct matmul
+ *   Value: (total_blocks, 16, 16) - direct format
  */
 
-#include <stddef.h>
-#include <stdint.h>
-
+#include <algorithm>
 #include <cinttypes>
+#include <cstdint>
+#include <cstring>
 
 #include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
 
@@ -34,6 +31,36 @@
 #define FUNC_SOFTMAX_PREPARE 1
 #define FUNC_PV_MATMUL 2
 #define FUNC_ONLINE_UPDATE 3
+// Tick rate assumed for the counter read by get_sys_cnt_aicpu().
+constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000;  // 50 MHz
+
+// Convert a tick count at PLATFORM_PROF_SYS_CNT_FREQ into microseconds.
+inline double cycles_to_us(uint64_t cycles) {
+    return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;
+}
+
+// Read the AArch64 virtual counter (cntvct_el0). The register only exists on
+// AArch64, so other hosts get 0 and the profiling totals simply report zero.
+inline uint64_t get_sys_cnt_aicpu() {
+#if defined(__aarch64__)
+    uint64_t ticks;
+    asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
+    return ticks;
+#else
+    return 0;
+#endif
+}
+
+// Declares the two timestamps (_t0, _t1) used by CYCLE_COUNT_LAP in this scope.
+#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
+// Adds the ticks elapsed since the previous lap to `acc` and restarts the lap.
+#define CYCLE_COUNT_LAP(acc)       \
+    do {                           \
+        _t1 = get_sys_cnt_aicpu(); \
+        acc += (_t1 - _t0);        \
+        _t0 = _t1;                 \
+    } while (0)
+
 extern "C" {
 
 __attribute__((visibility("default"))) PTO2OrchestrationConfig
@@ -44,27 +61,38 @@ aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
     };
 }
 
-__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
+__attribute__((visibility("default"))) void build_paged_attention_graph(const ChipStorageTaskArgs &orch_args) {
+    uint64_t prof_param_extract = 0;
+    uint64_t prof_ext_tensor = 0;
+    uint64_t prof_scope = 0;
+    uint64_t prof_make_tensor = 0;
+    uint64_t prof_tensor_view = 0;
+    uint64_t prof_param_setup = 0;
+    uint64_t prof_submit_task = 0;
+    int prof_submit_count = 0;
+    int prof_make_count = 0;
+    int prof_view_count = 0;
+
+    CYCLE_COUNT_START();
+
     // Read dimensions from tensor metadata
-    // query: shape=[batch, num_heads, head_dim]
     uint64_t batch = orch_args.tensor(0).shapes[0];
     uint64_t num_heads = orch_args.tensor(0).shapes[1];
     uint64_t head_dim = orch_args.tensor(0).shapes[2];
     DataType data_type = orch_args.tensor(0).dtype;
 
-    // key_cache: shape=[total_blocks, block_size, kv_head_num, head_dim]
     uint64_t block_size = orch_args.tensor(1).shapes[1];
-
-    // block_table: shape=[batch, max_num_blocks_per_req]
     uint64_t block_num = orch_args.tensor(3).shapes[1];
 
-    // scale from scalar arg
     uint64_t scale_value = orch_args.scalar(0);
 
     uint64_t q_head_num = num_heads;
-    uint64_t q_tile = 16;
+    // Explicit <uint64_t> keeps std::min well-formed on targets where uint64_t is not unsigned long.
+    uint64_t q_tile = std::min<uint64_t>(num_heads, 128);
     uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
-    uint64_t elem_size = get_element_size(data_type);
+    CYCLE_COUNT_LAP(prof_param_extract);
+
+    LOG_ALWAYS(">>>>>> batch = %" PRIu64, batch);
 
     // Reshape tensors for kernel consumption (2D flattened)
     void *query_ptr = orch_args.tensor(0).data_as<void>();
@@ -72,22 +99,21 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     void *vc_ptr = orch_args.tensor(2).data_as<void>();
     void *out_ptr = orch_args.tensor(5).data_as<void>();
 
-    // Compute kv_total_rows from key_cache tensor metadata
     uint64_t total_blocks_count = orch_args.tensor(1).shapes[0];
-    uint64_t kv_total_rows = total_blocks_count * block_size;
 
     uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
-    uint32_t key_cache_shapes[2] = {static_cast<uint32_t>(kv_total_rows), static_cast<uint32_t>(head_dim)};
-    uint32_t value_cache_shapes[2] = {static_cast<uint32_t>(kv_total_rows), static_cast<uint32_t>(head_dim)};
+    uint32_t key_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
+    uint32_t value_cache_shapes[2] = {
+        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
+    };
     uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
     Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type);
     Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type);
     Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type);
     Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32);
-    LOG_DEBUG("query=%s", query.dump().c_str());
-    LOG_DEBUG("key_cache=%s", key_cache.dump().c_str());
-    LOG_DEBUG("value_cache=%s", value_cache.dump().c_str());
-    LOG_DEBUG("out=%s", out.dump().c_str());
+    CYCLE_COUNT_LAP(prof_ext_tensor);
 
     uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(block_num)};
     Tensor block_table =
@@ -105,64 +131,93 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
     TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32);
     TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type);
 
+    prof_make_count += 4;
+    CYCLE_COUNT_LAP(prof_make_tensor);
+
+    int total_tasks = 0;
+
     for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
         uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
         uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
         uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
         for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
             PTO2_SCOPE() {
-                uint32_t cur_offset = static_cast<uint32_t>(b_idx * q_head_num + q_idx * q_tile);
+                CYCLE_COUNT_LAP(prof_scope);
+                uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
 
-                uint32_t qi_offsets[2] = {cur_offset, 0};
+                uint32_t qi_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
                 Tensor qi = query.view(tile2d_shapes, qi_offsets);
-                uint32_t out_view_offsets[2] = {cur_offset, 0};
+                uint32_t out_view_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
                 Tensor out_view = out.view(tile2d_shapes, out_view_offsets);
+                prof_view_count += 2;
+                CYCLE_COUNT_LAP(prof_tensor_view);
 
+                CYCLE_COUNT_LAP(prof_param_setup);
                 TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci);
                 const Tensor &oi = alloc_outs.get_ref(0);
                 const Tensor &li_update = alloc_outs.get_ref(1);
                 const Tensor &mi_update = alloc_outs.get_ref(2);
+                prof_submit_count++;
+                CYCLE_COUNT_LAP(prof_submit_task);
 
                 for (uint64_t bn = 0; bn < bn_this_batch; bn++) {
+                    PTO2_SCOPE_GUARD();
+
                     uint32_t bt_idx[2] = {static_cast<uint32_t>(b_idx), static_cast<uint32_t>(bn)};
                     uint64_t cur_block_idx = static_cast<uint64_t>(get_tensor_data<int32_t>(block_table, 2, bt_idx));
-                    uint64_t valid_len =
-                        block_size < (cur_seq - bn * block_size) ? block_size : (cur_seq - bn * block_size);
+                    uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size);
+                    CYCLE_COUNT_LAP(prof_param_extract);
+
                     uint32_t kv_shapes[2] = {static_cast<uint32_t>(block_size), static_cast<uint32_t>(head_dim)};
                     uint32_t kv_offsets[2] = {static_cast<uint32_t>(cur_block_idx * block_size), 0};
                     Tensor kj = key_cache.view(kv_shapes, kv_offsets);
                     Tensor vj = value_cache.view(kv_shapes, kv_offsets);
+                    prof_view_count += 2;
+                    CYCLE_COUNT_LAP(prof_tensor_view);
 
                     Arg params_qk;
                     params_qk.add_input(qi);
                     params_qk.add_input(kj);
                     params_qk.add_output(sij_ci);
+                    CYCLE_COUNT_LAP(prof_param_setup);
                     TaskOutputTensors qk_outs = pto2_rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
                     const Tensor &sij = qk_outs.get_ref(0);
+                    prof_submit_count++;
+                    CYCLE_COUNT_LAP(prof_submit_task);
 
                     uint32_t sij_valid_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(valid_len)};
                     uint32_t sij_valid_offsets[2] = {0, 0};
                     Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets);
+                    prof_view_count += 1;
+                    CYCLE_COUNT_LAP(prof_tensor_view);
+
                     Arg params_sf;
                     params_sf.add_input(sij_valid);
                     params_sf.add_output(pij_f16_ci);
                     params_sf.add_output(scalar_ci);
                     params_sf.add_output(scalar_ci);
                     params_sf.add_scalar(scale_value);
+                    CYCLE_COUNT_LAP(prof_param_setup);
                     TaskOutputTensors sf_outs = pto2_rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
                     const Tensor &pij_f16 = sf_outs.get_ref(0);
                     const Tensor &mi = sf_outs.get_ref(1);
                     const Tensor &li = sf_outs.get_ref(2);
+                    prof_submit_count++;
+                    CYCLE_COUNT_LAP(prof_submit_task);
 
                     Arg params_pv;
                     params_pv.add_input(pij_f16);
                     params_pv.add_input(vj);
                     params_pv.add_output(tile2d_ci);
+                    CYCLE_COUNT_LAP(prof_param_setup);
                     TaskOutputTensors pv_outs = pto2_rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
                     const Tensor &oi_tmp = pv_outs.get_ref(0);
+                    prof_submit_count++;
+                    CYCLE_COUNT_LAP(prof_submit_task);
 
                     uint64_t is_first = (bn == 0) ? 1 : 0;
                     uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0;
+                    CYCLE_COUNT_LAP(prof_param_extract);
 
                     Arg params_up;
                     params_up.add_input(mi);
@@ -174,13 +229,53 @@ __attribute__((visibility("default"))) void aicpu_orchestration_entry(const Chip
                     params_up.add_inout(out_view);
                     params_up.add_scalar(is_first);
                     params_up.add_scalar(is_last);
+                    CYCLE_COUNT_LAP(prof_param_setup);
                     pto2_rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
+                    prof_submit_count++;
+                    CYCLE_COUNT_LAP(prof_submit_task);
                 }
             }
+            CYCLE_COUNT_LAP(prof_scope);
         }
     }
 
-    LOG_INFO("tasks submitted for batch=%" PRIu64 ", num_heads=%" PRIu64, batch, num_heads);
+    uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
+                     prof_submit_task + prof_scope;
+    LOG_ALWAYS(
+        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count,
+        prof_make_count, prof_view_count, cycles_to_us(total)
+    );
+    if (total > 0) {
+        LOG_ALWAYS(
+            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract),
+            prof_param_extract * 100.0 / total
+        );
+        LOG_ALWAYS(
+            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total
+        );
+        LOG_ALWAYS(
+            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor),
+            prof_make_tensor * 100.0 / total,
+            prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0
+        );
+        LOG_ALWAYS(
+            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view),
+            prof_tensor_view * 100.0 / total,
+            prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0
+        );
+        LOG_ALWAYS(
+            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total
+        );
+        LOG_ALWAYS("  scope            : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total);
+        LOG_ALWAYS(
+            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task),
+            prof_submit_task * 100.0 / total,
+            prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0
+        );
+    }
+
+#undef CYCLE_COUNT_START
+#undef CYCLE_COUNT_LAP
 }
 
 }  // extern "C"
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
index 2e6eb99fb..a877c3ab2 100644
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
@@ -7,10 +7,10 @@
 # INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
 # See LICENSE in the root of the software repository for the full text of the License.
 # -----------------------------------------------------------------------------------------------------------
-"""Paged attention — tensormap_and_ringbuffer example (small scale, float16).
+"""Paged attention — tensormap_and_ringbuffer test (production and small scale, bfloat16).
 
 AIC+AIV mixed execution with online softmax paged attention.
-Small-scale cases including variable sequence lengths.
+Production-scale cases for A5 hardware, plus small-scale cases (some variable-length) for a5sim and a5.
 """
 
 import torch
@@ -25,13 +25,13 @@
 class TestPagedAttention(SceneTestCase):
     """Paged attention with tensormap_and_ringbuffer runtime on A5."""
 
-    RTOL = 1e-2
-    ATOL = 1e-2
+    RTOL = 1e-3
+    ATOL = 1e-3
 
     CALLABLE = {
         "orchestration": {
             "source": "kernels/orchestration/paged_attention_orch.cpp",
-            "function_name": "aicpu_orchestration_entry",
+            "function_name": "build_paged_attention_graph",
             "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
         },
         "incores": [
@@ -65,6 +65,53 @@ class TestPagedAttention(SceneTestCase):
     CASES = [
         {
             "name": "Case1",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "params": {
+                "batch": 256,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 128,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case2",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 128,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "Case3",
+            "platforms": ["a5"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 24},
+            "manual": True,
+            "params": {
+                "batch": 64,
+                "num_heads": 64,
+                "kv_head_num": 1,
+                "head_dim": 256,
+                "block_size": 64,
+                "context_len": 8192,
+                "max_model_len": 32768,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "SmallCase1",
             "platforms": ["a5sim", "a5"],
             "config": {"aicpu_thread_num": 4, "block_dim": 24},
             "params": {
@@ -75,11 +122,11 @@ class TestPagedAttention(SceneTestCase):
                 "block_size": 16,
                 "context_len": 33,
                 "max_model_len": 256,
-                "dtype": "float16",
+                "dtype": "bfloat16",
             },
         },
         {
-            "name": "Case2",
+            "name": "SmallCase2",
             "platforms": ["a5sim", "a5"],
             "config": {"aicpu_thread_num": 4, "block_dim": 24},
             "manual": True,
@@ -91,11 +138,11 @@ class TestPagedAttention(SceneTestCase):
                 "block_size": 16,
                 "context_len": 128,
                 "max_model_len": 256,
-                "dtype": "float16",
+                "dtype": "bfloat16",
             },
         },
         {
-            "name": "CaseVarSeq2",
+            "name": "SmallCaseVarSeq2",
             "platforms": ["a5sim", "a5"],
             "config": {"aicpu_thread_num": 4, "block_dim": 24},
             "manual": True,
@@ -108,11 +155,11 @@ class TestPagedAttention(SceneTestCase):
                 "context_len": 33,
                 "context_lens_list": [33, 17],
                 "max_model_len": 256,
-                "dtype": "float16",
+                "dtype": "bfloat16",
             },
         },
         {
-            "name": "CaseVarSeq4",
+            "name": "SmallCaseVarSeq4",
             "platforms": ["a5sim", "a5"],
             "config": {"aicpu_thread_num": 4, "block_dim": 24},
             "manual": True,
@@ -125,7 +172,7 @@ class TestPagedAttention(SceneTestCase):
                 "context_len": 128,
                 "context_lens_list": [33, 64, 128, 15],
                 "max_model_len": 256,
-                "dtype": "float16",
+                "dtype": "bfloat16",
             },
         },
     ]
diff --git a/tests/st/a5/host_build_graph/paged_attention/README.md b/tests/st/a5/host_build_graph/paged_attention/README.md
index bb280c331..c6c7a56a3 100644
--- a/tests/st/a5/host_build_graph/paged_attention/README.md
+++ b/tests/st/a5/host_build_graph/paged_attention/README.md
@@ -1,4 +1,4 @@
-# Paged Attention (Device Test)
+# Paged Attention (A5 host_build_graph)
 
 This example demonstrates Paged Attention implementation using CCE (Cube Core Engine) code generation, with AIC matmul kernels and AIV vector kernels using PTO Tile API.
 
@@ -13,16 +13,18 @@ Paged Attention is an efficient attention mechanism that processes KV cache in f
 ### Supported Platforms
 
 | Platform | Description |
-|----------|-------------|
-| a2a3 | Ascend hardware (requires device ID) |
+| -------- | ----------- |
+| a5sim | Simulator |
+| a5 | Ascend hardware |
 
-> This test uses bfloat16 data types and production-scale shapes that are not supported by the a2a3sim simulator. It only runs on real hardware.
+This directory contains the `host_build_graph` variant of the A5 paged attention scene test.
+The `tensormap_and_ringbuffer` variant lives separately under `examples/a5/tensormap_and_ringbuffer/paged_attention/`.
 
 ### Algorithm
 
 For each query token, the attention is computed incrementally across KV cache blocks:
 
-```
+```text
 For each block j:
     sij = Qi @ Kj^T                    # QK MatMul (AIC)
     mij, lij, pij = softmax_prepare(sij)  # Softmax (AIV)
@@ -33,7 +35,7 @@ For each block j:
 ### Kernel Design (AIC/AIV Split)
 
 | Kernel | Core Type | Operation | Key Instructions |
-|--------|-----------|-----------|------------------|
+| ------ | --------- | --------- | ---------------- |
 | aic_qk_matmul | AIC (Cube) | Q @ K^T | TLOAD/TMOV/TMATMUL/TSTORE |
 | aiv_softmax_prepare | AIV (Vector) | scale, rowmax, exp, rowsum | TMULS/TROWMAX/TROWEXPANDSUB/TEXP/TROWSUM |
 | aic_pv_matmul | AIC (Cube) | P @ V | TLOAD/TMOV/TMATMUL/TSTORE |
@@ -41,7 +43,7 @@ For each block j:
 
 ### Memory Hierarchy (AIC Matmul)
 
-```
+```text
 GM -> L1 (Mat tiles) -> L0A/L0B -> L0C (Accumulator) -> GM
 ```
 
@@ -49,7 +51,7 @@ GM -> L1 (Mat tiles) -> L0A/L0B -> L0C (Accumulator) -> GM
 
 For each batch, the task dependency pattern is:
 
-```
+```text
 Block 0: QK -> SF -> PV --+
 Block 1: QK -> SF -> PV --+--> UP[0] -> UP[1] -> ... -> UP[n]
 Block n: QK -> SF -> PV --+
@@ -61,45 +63,40 @@ Block n: QK -> SF -> PV --+
 ## Quick Start
 
 ```bash
-# Run on hardware (specify device ID)
-python examples/scripts/run_example.py \
-  -k tests/st/host_build_graph/paged_attention/kernels \
-  -g tests/st/host_build_graph/paged_attention/golden.py \
-  -p a2a3 -d 0
-
-# Run multi-block test case
-PA_CASE=Case2 python examples/scripts/run_example.py \
-  -k tests/st/host_build_graph/paged_attention/kernels \
-  -g tests/st/host_build_graph/paged_attention/golden.py \
-  -p a2a3 -d 0
+# Run the default case on sim
+python tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py -p a5sim
+
+# Run a specific hardware case
+python tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py -p a5 -d 0 -k Case2
 ```
 
 ## Directory Structure
 
-```
+```text
 paged_attention/
 ├── README.md                    # This file
-├── golden.py                    # Input generation and expected output
+├── test_paged_attention.py      # Scene test entry
 └── kernels/
-    ├── kernel_config.py         # Kernel registration config
-    ├── aic/                      # AIC kernels (CCE codegen style)
-    │   ├── aic_qk_matmul.cpp     # Q @ K^T matmul
-    │   └── aic_pv_matmul.cpp     # P @ V matmul
-    ├── aiv/                      # AIV kernels (PTO Tile API)
-    │   ├── aiv_softmax_prepare.cpp  # Softmax preparation
-    │   └── aiv_online_update.cpp    # Online Softmax update + normalize
+    ├── aic/
+    │   ├── aic_qk_matmul.cpp
+    │   └── aic_pv_matmul.cpp
+    ├── aiv/
+    │   ├── aiv_softmax_prepare.cpp
+    │   └── aiv_online_update.cpp
     └── orchestration/
-        └── paged_attention_orch.cpp # Task graph builder
+        └── paged_attention_orch.cpp
 ```
 
 ## Test Cases
 
-| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Description |
-|------|-------|-----------|-------------|----------|------------|-------------|-------------|
-| Case1 | 1 | 16 | 1 | 128 | 128 | 256 | Small scale (default) |
-| Case2 | 8 | 64 | 1 | 128 | 64 | 8192 | Production scale |
+| Case | batch | num_heads | kv_head_num | head_dim | block_size | context_len | Platforms |
+| ---- | ----- | --------- | ----------- | -------- | ---------- | ----------- | --------- |
+| Case1 | 256 | 16 | 1 | 128 | 128 | 8100 | a5 |
+| Case2 | 64 | 64 | 1 | 128 | 64 | 8150 | a5 |
+| SmallCase1 | 1 | 16 | 1 | 16 | 16 | 16 | a5sim, a5 |
+| SmallCase2 | 1 | 16 | 1 | 16 | 16 | 64 | a5sim, a5 |
 
-All test cases use **bfloat16** Q/K/V inputs with GQA (kv_head_num=1).
+All cases use **bfloat16** Q/K/V inputs with GQA (`kv_head_num=1`).
 
 ## Key Technical Details
 
@@ -161,16 +158,16 @@ TROWEXPANDMUL(oiTile, oiTile, alphaTileDN);
 
 ## Expected Output
 
-```
+```text
 === Compiling and Registering Kernels ===
 Compiling kernel: .../aic_qk_matmul.cpp (func_id=0)
 Compiling kernel: .../aiv_softmax_prepare.cpp (func_id=1)
 Compiling kernel: .../aic_pv_matmul.cpp (func_id=2)
 Compiling kernel: .../aiv_online_update.cpp (func_id=3)
 ...
-=== build_paged_attention_graph (16x16 framework version) ===
+=== build_paged_attention_graph ===
 batch=1, num_heads=16, kv_head_num=1, head_dim=16
-block_size=16, block_num=1
+block_size=16, max_num_blocks=16
 ...
 Created 4 tasks
 ...
@@ -185,7 +182,7 @@ TEST PASSED
 
 ## Reference
 
-This implementation uses the Online Softmax algorithm for paged attention, with identical kernel structure to the PyPTO reference implementation.
+This implementation uses the Online Softmax algorithm for paged attention, with an AIC/AIV split tailored for the `host_build_graph` runtime on A5.
 
 ## See Also
 
diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
index 8d708928c..74584d6a1 100644
--- a/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
+++ b/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_pv_matmul.cpp
@@ -98,9 +98,12 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     __gm__ uint8_t *vj = reinterpret_cast<__gm__ uint8_t *>(args[1]);
     __gm__ uint8_t *oi_new = reinterpret_cast<__gm__ uint8_t *>(args[2]);
     int q_tile_size = static_cast<int>(args[3]);
-    // args[4] = block_size, args[5] = head_dim
+    int block_size = static_cast<int>(args[4]);
+    // args[5] = head_dim
 
-    if (q_tile_size == 16) {
+    if (q_tile_size == 16 && block_size <= 16) {
+        pv_matmul_impl<16, 16, 16>(pij, vj, oi_new);
+    } else if (q_tile_size == 16) {
         pv_matmul_impl<16, 128, 128>(pij, vj, oi_new);
     } else {
         pv_matmul_impl<64, 64, 128>(pij, vj, oi_new);
diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
index abbf6537c..27c524a32 100644
--- a/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
+++ b/tests/st/a5/host_build_graph/paged_attention/kernels/aic/aic_qk_matmul.cpp
@@ -99,9 +99,12 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     __gm__ uint8_t *kj = reinterpret_cast<__gm__ uint8_t *>(args[1]);
     __gm__ uint8_t *sij = reinterpret_cast<__gm__ uint8_t *>(args[2]);
     int q_tile_size = static_cast<int>(args[3]);
-    // args[4] = head_dim (128), args[5] = block_size
+    int head_dim = static_cast<int>(args[4]);
+    // args[5] = block_size
 
-    if (q_tile_size == 16) {
+    if (q_tile_size == 16 && head_dim <= 16) {
+        qk_matmul_impl<16, 16, 16>(qi, kj, sij);
+    } else if (q_tile_size == 16) {
         qk_matmul_impl<16, 128, 128>(qi, kj, sij);
     } else {
         qk_matmul_impl<64, 128, 64>(qi, kj, sij);
diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
index fbc55c324..965dffbbc 100644
--- a/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
+++ b/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_online_update.cpp
@@ -229,9 +229,11 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     int is_last = static_cast<int>(args[7]);
     __gm__ uint8_t *dst = reinterpret_cast<__gm__ uint8_t *>(args[8]);
     int q_tile_size = static_cast<int>(args[9]);
-    // args[10] = head_dim (128)
+    int head_dim = static_cast<int>(args[10]);
 
-    if (q_tile_size == 16) {
+    if (q_tile_size == 16 && head_dim <= 16) {
+        online_update_impl<16, 16>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
+    } else if (q_tile_size == 16) {
         online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
     } else {
         online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
diff --git a/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
index 0e87b525d..51a3315c9 100644
--- a/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
+++ b/tests/st/a5/host_build_graph/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
@@ -127,10 +127,12 @@ extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
     __gm__ uint8_t *mij = reinterpret_cast<__gm__ uint8_t *>(args[3]);
     __gm__ uint8_t *lij = reinterpret_cast<__gm__ uint8_t *>(args[4]);
     int q_tile_size = static_cast<int>(args[5]);
-    // args[6] = block_size
+    int block_size = static_cast<int>(args[6]);
     int valid_len = static_cast<int>(args[7]);
 
-    if (q_tile_size == 16) {
+    if (q_tile_size == 16 && block_size <= 16) {
+        softmax_prepare_impl<16, 16>(sij, scale_value, pij, mij, lij, valid_len);
+    } else if (q_tile_size == 16) {
         softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij, valid_len);
     } else {
         softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij, valid_len);
diff --git a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py
index 2d3b12d3b..143092ce5 100644
--- a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py
+++ b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py
@@ -94,6 +94,37 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase):
                 "dtype": "bfloat16",
             },
         },
+        {
+            "name": "SmallCase1",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 16,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
+        {
+            "name": "SmallCase2",
+            "platforms": ["a5sim", "a5"],
+            "config": {"aicpu_thread_num": 3, "block_dim": 3},
+            "manual": True,
+            "params": {
+                "batch": 1,
+                "num_heads": 16,
+                "kv_head_num": 1,
+                "head_dim": 16,
+                "block_size": 16,
+                "context_len": 64,
+                "max_model_len": 256,
+                "dtype": "bfloat16",
+            },
+        },
     ]
 
     def generate_args(self, params):
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aic/kernel_matmul.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_add_standalone.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/aiv/kernel_mul_standalone.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/kernels/orchestration/mixed_orch.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
rename to tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp
deleted file mode 100644
index 5bca56442..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_pv_matmul.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-// PV Matmul Kernel: pij(M, K) @ vj(K, N) -> oi_new(M, N)
-//
-// Supports two tile configurations via runtime dispatch:
-//   Case1: (16, 128) @ (128, 128) -> (16, 128)
-//   Case2: (64,  64) @ ( 64, 128) -> (64, 128)
-//
-// pij is bfloat16 (converted from fp32 in softmax_prepare via TCVT).
-// vj is stored as (K, N) = (block_size, head_dim) in row-major (ND) layout.
-// Standard non-transposed B pattern: ND GlobalB + ColMajor/RowMajor TileMatB.
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-template <int M, int K, int N>
-static __aicore__ void pv_matmul_impl(__gm__ Tensor *pij, __gm__ Tensor *vj, __gm__ Tensor *oi) {
-    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
-    __gm__ bfloat16_t *vj_addr = reinterpret_cast<__gm__ bfloat16_t *>(vj->buffer.addr);
-    __gm__ float *oi_addr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
-
-    // pij (M, K) bf16, vj (K, N) bf16 in ND (row-major), oi_new (M, N) fp32
-    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, pto::Stride<M * K, M * K, M * K, K, 1>>;
-    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, pto::Stride<K * N, K * N, K * N, N, 1>>;
-    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<M * N, M * N, M * N, N, 1>>;
-
-    GlobalA pijGlobal(pij_addr + pij->start_offset);
-    GlobalB vjGlobal(vj_addr + vj->start_offset);
-    GlobalOut oiGlobal(oi_addr + oi->start_offset);
-
-    // L1 Mat tiles: standard ND pattern for both A and B
-    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::ColMajor, K, N, SLayout::RowMajor, 512>;
-
-    // L0 tiles
-    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
-    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
-    using AccTile = TileAcc<float, M, N, M, N>;
-
-    TileMatA aMatTile;
-    TileMatB bMatTile;
-    TASSIGN(aMatTile, 0x0);
-    TASSIGN(bMatTile, 0x20000);
-
-    LeftTile aTile;
-    RightTile bTile;
-    AccTile cTile;
-    TASSIGN(aTile, 0x0);
-    TASSIGN(bTile, 0x0);
-    TASSIGN(cTile, 0x0);
-
-    // Load pij and vj to L1 with separate events for pipeline overlap
-    TLOAD(aMatTile, pijGlobal);
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
-    TLOAD(bMatTile, vjGlobal);
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
-
-    // Move A to L0A as soon as A load completes (B may still be loading)
-    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-    TMOV(aTile, aMatTile);
-    // Move B to L0B after B load completes
-    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
-    TMOV(bTile, bMatTile);
-
-    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-    // Single matmul: (M,K) x (K,N) -> (M,N)
-    TMATMUL(cTile, aTile, bTile);
-
-    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-
-    TSTORE(oiGlobal, cTile);
-
-    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-}
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *vj = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
-    uint64_t q_tile_size = static_cast<uint64_t>(pij->shapes[0]);
-    // args[4] = block_size, args[5] = head_dim
-
-    if (q_tile_size == 16) {
-        pv_matmul_impl<16, 128, 128>(pij, vj, oi_new);
-    } else {
-        pv_matmul_impl<64, 64, 128>(pij, vj, oi_new);
-    }
-}
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp
deleted file mode 100644
index 0bfa9c460..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aic/aic_qk_matmul.cpp
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-// QK Matmul Kernel: qi(M, K) @ kj.T(K, N) -> sij(M, N)
-//
-// Supports two tile configurations via runtime dispatch:
-//   Case1: (16, 128) @ (128, 128).T -> (16, 128)
-//   Case2: (64, 128) @ (128,  64).T -> (64,  64)
-//
-// kj is stored as (N, K) = (block_size, head_dim) in row-major memory.
-// This is equivalent to (K, N) in column-major (DN) layout.
-// Using DN GlobalB + RowMajor/ColMajor TileMatB to handle the transposed B pattern.
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-template <int M, int K, int N>
-static __aicore__ void qk_matmul_impl(__gm__ Tensor *qi, __gm__ Tensor *kj, __gm__ Tensor *sij) {
-    __gm__ bfloat16_t *qi_addr = reinterpret_cast<__gm__ bfloat16_t *>(qi->buffer.addr);
-    __gm__ bfloat16_t *kj_addr = reinterpret_cast<__gm__ bfloat16_t *>(kj->buffer.addr);
-    __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
-
-    // qi (M, K) bf16 in ND (row-major) layout
-    using GlobalA = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, K>, pto::Stride<M * K, M * K, M * K, K, 1>>;
-    // kj stored as (N, K) row-major = (K, N) column-major -> DN layout
-    using GlobalB = GlobalTensor<bfloat16_t, Shape<1, 1, 1, K, N>, pto::Stride<K * N, K * N, K * N, 1, K>, Layout::DN>;
-    using GlobalOut = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<M * N, M * N, M * N, N, 1>>;
-
-    GlobalA qiGlobal(qi_addr + qi->start_offset);
-    GlobalB kjGlobal(kj_addr + kj->start_offset);
-    GlobalOut sijGlobal(sij_addr + sij->start_offset);
-
-    // L1 Mat tiles: A is standard ND, B uses transposed-B pattern (RowMajor/ColMajor)
-    using TileMatA = Tile<TileType::Mat, bfloat16_t, M, K, BLayout::ColMajor, M, K, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, bfloat16_t, K, N, BLayout::RowMajor, K, N, SLayout::ColMajor, 512>;
-
-    // L0 tiles
-    using LeftTile = TileLeft<bfloat16_t, M, K, M, K>;
-    using RightTile = TileRight<bfloat16_t, K, N, K, N>;
-    using AccTile = TileAcc<float, M, N, M, N>;
-
-    TileMatA aMatTile;
-    TileMatB bMatTile;
-    TASSIGN(aMatTile, 0x0);
-    TASSIGN(bMatTile, 0x20000);
-
-    LeftTile aTile;
-    RightTile bTile;
-    AccTile cTile;
-    TASSIGN(aTile, 0x0);
-    TASSIGN(bTile, 0x0);
-    TASSIGN(cTile, 0x0);
-
-    // Load A and B to L1 with separate events for pipeline overlap
-    TLOAD(aMatTile, qiGlobal);
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);  // A load done
-    TLOAD(bMatTile, kjGlobal);
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);  // B load done
-
-    // Move A to L0A as soon as A load completes (B may still be loading)
-    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-    TMOV(aTile, aMatTile);
-    // Move B to L0B after B load completes
-    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID1);
-    TMOV(bTile, bMatTile);
-
-    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-    // Matmul
-    TMATMUL(cTile, aTile, bTile);
-
-    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-
-    TSTORE(sijGlobal, cTile);
-
-    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-}
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *qi = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *kj = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[2]);
-    uint64_t q_tile_size = static_cast<uint64_t>(qi->shapes[0]);
-    // args[4] = head_dim (128), args[5] = block_size
-
-    if (q_tile_size == 16) {
-        qk_matmul_impl<16, 128, 128>(qi, kj, sij);
-    } else {
-        qk_matmul_impl<64, 128, 64>(qi, kj, sij);
-    }
-}
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp
deleted file mode 100644
index a7ffed408..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_online_update.cpp
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-// Online Softmax Update + Normalize Kernel (AIV)
-//
-// Operates on full tiles where M=q_tile_size, N=head_dim (128):
-//   Case1: oi/oi_new are (16, 128), mij/lij/mi/li are 16-element vectors
-//   Case2: oi/oi_new are (64, 128), mij/lij/mi/li are 64-element vectors
-//
-// Scalar layout strategy using TRESHAPE (zero-copy UB reshape):
-//   Scalars loaded as DN ColMajor (M, 1) for TROWEXPANDMUL/TROWEXPANDDIV.
-//   For element-wise ops (TMAX, TSUB, TEXP, etc.), TRESHAPE to RowMajor (1, M).
-//   After arithmetic, TRESHAPE back to ColMajor (M, 1) for row-broadcast ops.
-//   This eliminates the GM round-trip (TSTORE ND → TLOAD DN) used in the original.
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-template <int M, int N>
-static __aicore__ void online_update_impl(
-    __gm__ Tensor *mij, __gm__ Tensor *lij, __gm__ Tensor *oi_new, __gm__ Tensor *mi, __gm__ Tensor *li,
-    __gm__ Tensor *oi, uint64_t is_first, uint64_t is_last, __gm__ Tensor *dst
-) {
-    __gm__ float *mij_ptr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
-    __gm__ float *lij_ptr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
-    __gm__ float *oi_new_ptr = reinterpret_cast<__gm__ float *>(oi_new->buffer.addr);
-    __gm__ float *mi_ptr = reinterpret_cast<__gm__ float *>(mi->buffer.addr);
-    __gm__ float *li_ptr = reinterpret_cast<__gm__ float *>(li->buffer.addr);
-    __gm__ float *oi_ptr = reinterpret_cast<__gm__ float *>(oi->buffer.addr);
-    __gm__ float *dst_ptr = reinterpret_cast<__gm__ float *>(dst->buffer.addr);
-
-    // Aligned rows for ColMajor DN tiles (32-byte alignment)
-    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
-
-    // --- GlobalTensor types ---
-
-    // Data (M, N) RowMajor
-    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
-
-    // Scalar DN: M contiguous floats as (kAlignedRows, 1) ColMajor for TROWEXPAND ops and loading
-    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>;
-
-    // Scalar ND: for storing mi_new and li_new back to GM
-    constexpr int kScalarCols = 32 / sizeof(float);
-    constexpr int kScalarRows = M / kScalarCols;
-    using GlobalScalarND =
-        GlobalTensor<float, Shape<1, 1, 1, kScalarRows, kScalarCols>, pto::Stride<1, 1, 1, kScalarCols, 1>>;
-
-    // --- GlobalTensor instances ---
-
-    GlobalDataMxN oiNewGlobal(oi_new_ptr + oi_new->start_offset);
-    GlobalDataMxN oiGlobal(oi_ptr + oi->start_offset);
-    GlobalDataMxN dstGlobal(dst_ptr + dst->start_offset);
-
-    // DN globals for loading scalars as ColMajor
-    GlobalScalarDN mijGlobalDN(mij_ptr + mij->start_offset);
-    GlobalScalarDN lijGlobalDN(lij_ptr + lij->start_offset);
-    GlobalScalarDN miGlobalDN(mi_ptr + mi->start_offset);
-    GlobalScalarDN liGlobalDN(li_ptr + li->start_offset);
-
-    // ND globals for storing scalar results
-    GlobalScalarND miGlobalND(mi_ptr + mi->start_offset);
-    GlobalScalarND liGlobalND(li_ptr + li->start_offset);
-
-    // --- Tile types ---
-
-    using TileDataMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
-    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
-
-    // RowMajor (1, M) tiles for element-wise arithmetic via TRESHAPE
-    using TileScalarRow = Tile<TileType::Vec, float, 1, M, BLayout::RowMajor, 1, M>;
-
-    // ND tile for storing back to GM
-    using TileScalarND =
-        Tile<TileType::Vec, float, kScalarRows, kScalarCols, BLayout::RowMajor, kScalarRows, kScalarCols>;
-
-    // --- UB memory layout ---
-
-    constexpr int kDataBytes = M * N * sizeof(float);
-    constexpr int kScalarDNBytes = kAlignedRows * sizeof(float);
-
-    // Data tiles
-    TileDataMxN oiNewTile;
-    TileDataMxN oiTile;
-
-    // Scalar DN tiles loaded from GM (ColMajor)
-    TileScalarDN mijDN, lijDN, miDN, liDN;
-
-    // Temporary DN tiles for results
-    TileScalarDN miNewDN, alphaDN, betaDN, liNewDN, tmpDN;
-
-    TASSIGN(oiNewTile, 0);
-    TASSIGN(oiTile, kDataBytes);
-    TASSIGN(mijDN, 2 * kDataBytes);
-    TASSIGN(lijDN, 2 * kDataBytes + kScalarDNBytes);
-    TASSIGN(miDN, 2 * kDataBytes + 2 * kScalarDNBytes);
-    TASSIGN(liDN, 2 * kDataBytes + 3 * kScalarDNBytes);
-    TASSIGN(miNewDN, 2 * kDataBytes + 4 * kScalarDNBytes);
-    TASSIGN(alphaDN, 2 * kDataBytes + 5 * kScalarDNBytes);
-    TASSIGN(betaDN, 2 * kDataBytes + 6 * kScalarDNBytes);
-    TASSIGN(liNewDN, 2 * kDataBytes + 7 * kScalarDNBytes);
-    TASSIGN(tmpDN, 2 * kDataBytes + 8 * kScalarDNBytes);
-
-    if (is_first) {
-        // --- First block: copy inputs to accumulators ---
-        TLOAD(oiNewTile, oiNewGlobal);
-        TLOAD(mijDN, mijGlobalDN);
-        TLOAD(lijDN, lijGlobalDN);
-        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-        // Store mi = mij, li = lij, oi = oi_new
-        // Alias ND tiles to the same UB as DN tiles for storing as ND format
-        TileScalarND mijND, lijND;
-        TASSIGN(mijND, 2 * kDataBytes);                   // alias same UB as mijDN
-        TASSIGN(lijND, 2 * kDataBytes + kScalarDNBytes);  // alias same UB as lijDN
-
-        set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-        TSTORE(miGlobalND, mijND);    // mi = mij
-        TSTORE(liGlobalND, lijND);    // li = lij
-        TSTORE(oiGlobal, oiNewTile);  // oi = oi_new
-
-        if (is_last) {
-            // Single block: normalize dst = oi_new / lij
-            // lijDN already in ColMajor DN format, use directly for TROWEXPANDDIV
-            set_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
-            wait_flag(PIPE_MTE3, PIPE_V, EVENT_ID0);
-            TROWEXPANDDIV(oiNewTile, oiNewTile, lijDN);
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-            TSTORE(dstGlobal, oiNewTile);
-        }
-    } else {
-        // --- Subsequent blocks: accumulate ---
-
-        // Load all inputs
-        TLOAD(oiNewTile, oiNewGlobal);
-        TLOAD(oiTile, oiGlobal);
-        TLOAD(mijDN, mijGlobalDN);
-        TLOAD(lijDN, lijGlobalDN);
-        TLOAD(miDN, miGlobalDN);
-        TLOAD(liDN, liGlobalDN);
-        set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-        wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-        // TRESHAPE: ColMajor(M,1) → RowMajor(1,M) for element-wise arithmetic
-        TileScalarRow miRow, mijRow, liRow, lijRow;
-        TRESHAPE(miRow, miDN);
-        TRESHAPE(mijRow, mijDN);
-        TRESHAPE(liRow, liDN);
-        TRESHAPE(lijRow, lijDN);
-
-        // Scalar arithmetic in RowMajor (1, M) layout
-        TileScalarRow miNewRow, alphaRow, betaRow, liNewRow, tmpRow;
-        TASSIGN(miNewRow, 2 * kDataBytes + 4 * kScalarDNBytes);
-        TASSIGN(alphaRow, 2 * kDataBytes + 5 * kScalarDNBytes);
-        TASSIGN(betaRow, 2 * kDataBytes + 6 * kScalarDNBytes);
-        TASSIGN(liNewRow, 2 * kDataBytes + 7 * kScalarDNBytes);
-        TASSIGN(tmpRow, 2 * kDataBytes + 8 * kScalarDNBytes);
-
-        TMAX(miNewRow, miRow, mijRow);     // mi_new = max(mi, mij)
-        TSUB(alphaRow, miRow, miNewRow);   // alpha_exp = mi - mi_new
-        TEXP(alphaRow, alphaRow);          // alpha = exp(mi - mi_new)
-        TSUB(betaRow, mijRow, miNewRow);   // beta_exp = mij - mi_new
-        TEXP(betaRow, betaRow);            // beta = exp(mij - mi_new)
-        TMUL(tmpRow, alphaRow, liRow);     // alpha * li
-        TMUL(liNewRow, betaRow, lijRow);   // beta * lij
-        TADD(liNewRow, tmpRow, liNewRow);  // li_new = alpha*li + beta*lij
-
-        // TRESHAPE back: RowMajor(1,M) → ColMajor(M,1) for TROWEXPANDMUL
-        TRESHAPE(alphaDN, alphaRow);
-        TRESHAPE(betaDN, betaRow);
-
-        // Scale data tiles using row-broadcast multiply
-        TROWEXPANDMUL(oiTile, oiTile, alphaDN);       // oi *= alpha
-        TROWEXPANDMUL(oiNewTile, oiNewTile, betaDN);  // oi_new *= beta
-        TADD(oiTile, oiTile, oiNewTile);              // oi = alpha*oi + beta*oi_new
-
-        // Store mi_new and li_new to GM (ND format)
-        // Alias ND tiles to the same UB locations as miNewRow and liNewRow
-        TileScalarND miNewND, liNewND;
-        TASSIGN(miNewND, 2 * kDataBytes + 4 * kScalarDNBytes);
-        TASSIGN(liNewND, 2 * kDataBytes + 7 * kScalarDNBytes);
-
-        if (is_last) {
-            // Normalize and output: dst = oi / li_new
-            TRESHAPE(liNewDN, liNewRow);
-            TROWEXPANDDIV(oiTile, oiTile, liNewDN);
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-            TSTORE(miGlobalND, miNewND);  // persist mi_new
-            TSTORE(liGlobalND, liNewND);  // persist li_new
-            TSTORE(dstGlobal, oiTile);
-        } else {
-            // Store updated accumulators
-            set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-            wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-            TSTORE(miGlobalND, miNewND);  // persist mi_new
-            TSTORE(liGlobalND, liNewND);  // persist li_new
-            TSTORE(oiGlobal, oiTile);
-        }
-    }
-
-    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-}
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ Tensor *oi_new = reinterpret_cast<__gm__ Tensor *>(args[2]);
-    __gm__ Tensor *mi = reinterpret_cast<__gm__ Tensor *>(args[3]);
-    __gm__ Tensor *li = reinterpret_cast<__gm__ Tensor *>(args[4]);
-    __gm__ Tensor *oi = reinterpret_cast<__gm__ Tensor *>(args[5]);
-    __gm__ Tensor *dst = reinterpret_cast<__gm__ Tensor *>(args[6]);
-    uint64_t is_first = static_cast<uint64_t>(args[7]);
-    uint64_t is_last = static_cast<uint64_t>(args[8]);
-    uint64_t q_tile_size = static_cast<uint64_t>(mij->shapes[0]);
-    // args[10] = head_dim (128)
-
-    if (q_tile_size == 16) {
-        online_update_impl<16, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
-    } else {
-        online_update_impl<64, 128>(mij, lij, oi_new, mi, li, oi, is_first, is_last, dst);
-    }
-}
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
deleted file mode 100644
index 0e6e6bd9c..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
+++ /dev/null
@@ -1,146 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-// Softmax Preparation Kernel (AIV) with partial block masking
-//
-// Operates on (M, N) tile where M=q_tile_size, N=block_size:
-//   Case1: sij is (16, 128)
-//   Case2: sij is (64, 64)
-//
-// For partial blocks (valid_len < N), positions [valid_len, N) in sij are
-// filled with -inf via TFILLPAD_INPLACE before softmax, ensuring exp(-inf)=0
-// so that invalid key positions contribute zero attention weight.
-//
-// Computes:
-//   sij_masked = TFILLPAD(sij, valid_len, pad=-inf)
-//   sij_scale = sij_masked * scale
-//   mij = row_max(sij_scale)        -> (M, 1)
-//   pij = exp(sij_scale - mij)      -> (M, N)
-//   lij = row_sum(pij)              -> (M, 1)
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-template <int M, int N>
-static __aicore__ void softmax_prepare_impl(
-    __gm__ Tensor *sij, float scale_value, __gm__ Tensor *pij, __gm__ Tensor *mij, __gm__ Tensor *lij
-) {
-    uint64_t valid_len = static_cast<uint64_t>(sij->shapes[1]);
-    __gm__ float *sij_addr = reinterpret_cast<__gm__ float *>(sij->buffer.addr);
-    __gm__ bfloat16_t *pij_addr = reinterpret_cast<__gm__ bfloat16_t *>(pij->buffer.addr);
-    __gm__ float *mij_addr = reinterpret_cast<__gm__ float *>(mij->buffer.addr);
-    __gm__ float *lij_addr = reinterpret_cast<__gm__ float *>(lij->buffer.addr);
-
-    constexpr int kAlignedRows = ((M * sizeof(float) + 31) / 32) * (32 / sizeof(float));
-
-    using GlobalDataMxN = GlobalTensor<float, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
-    using GlobalDataMxN_bf16 = GlobalTensor<bfloat16_t, Shape<1, 1, 1, M, N>, pto::Stride<1, 1, 1, N, 1>>;
-    using GlobalScalarDN = GlobalTensor<float, Shape<1, 1, 1, kAlignedRows, 1>, pto::Stride<1, 1, 1, 1, 1>, Layout::DN>;
-
-    GlobalDataMxN sijGlobal(sij_addr + sij->start_offset);
-    GlobalDataMxN_bf16 pijGlobal(pij_addr + pij->start_offset);
-    GlobalScalarDN mijGlobal(mij_addr + mij->start_offset);
-    GlobalScalarDN lijGlobal(lij_addr + lij->start_offset);
-
-    // Dynamic-cols tile: marks which columns are valid for TFILLPAD boundary
-    using TileSijDyn = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, -1>;
-    // Padded tile: TFILLPAD_INPLACE fills positions [valid_len, N) with -inf
-    using TileSijPad = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N, SLayout::NoneBox, 512, PadValue::Min>;
-
-    using TileVecMxN = Tile<TileType::Vec, float, M, N, BLayout::RowMajor, M, N>;
-    using TileVecMxN_bf16 = Tile<TileType::Vec, bfloat16_t, M, N, BLayout::RowMajor, M, N>;
-    using TileScalarDN = Tile<TileType::Vec, float, kAlignedRows, 1, BLayout::ColMajor, M, 1>;
-
-    TileVecMxN sijTile;
-    TileSijDyn sijDynTile(static_cast<size_t>(valid_len));
-    TileSijPad sijPadTile;
-    TileVecMxN pijTile;
-    TileVecMxN tmpTile;
-    TileScalarDN maxTile;
-    TileScalarDN sumTile;
-    TileVecMxN_bf16 pijBf16Tile;
-
-    // All sij tiles share UB address 0x0 (in-place masking)
-    TASSIGN(sijTile, 0x0);
-    TASSIGN(sijDynTile, 0x0);
-    TASSIGN(sijPadTile, 0x0);
-    TASSIGN(pijTile, M * N * sizeof(float));
-    TASSIGN(tmpTile, 2 * M * N * sizeof(float));
-    TASSIGN(maxTile, 3 * M * N * sizeof(float));
-    TASSIGN(sumTile, 3 * M * N * sizeof(float) + kAlignedRows * sizeof(float));
-    TASSIGN(pijBf16Tile, 3 * M * N * sizeof(float) + 2 * kAlignedRows * sizeof(float));
-
-    // Load full sij (M, N) tile from GM - all N columns including garbage for partial blocks
-    // printf("sij addr incore %x\n", sij->buffer.addr);
-    TLOAD(sijTile, sijGlobal);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-
-    // Mask columns [valid_len, N) with -inf. sijDynTile provides the valid boundary,
-    // sijPadTile provides PadValue::Min as the fill value. No-op when valid_len == N.
-    TFILLPAD_INPLACE(sijPadTile, sijDynTile);
-
-    TMULS(sijTile, sijTile, scale_value);
-    TROWMAX(maxTile, sijTile, tmpTile);
-    TROWEXPANDSUB(pijTile, sijTile, maxTile);
-    TEXP(pijTile, pijTile);
-    // Truncate pij to bf16 first
-    TCVT(pijBf16Tile, pijTile, RoundMode::CAST_ROUND);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);  // pij bf16 ready, can store early
-
-    // Continue computing: bf16 → f32 and rowsum while pij store proceeds in parallel
-    TCVT(pijTile, pijBf16Tile, RoundMode::CAST_ROUND);
-    TROWSUM(sumTile, pijTile, tmpTile);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);  // sum ready
-
-    // Store pij (overlaps with TCVT + TROWSUM above)
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(pijGlobal, pijBf16Tile);
-
-    // Store max and sum
-    TSTORE(mijGlobal, maxTile);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
-    TSTORE(lijGlobal, sumTile);
-
-    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-}
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *sij = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *pij = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ Tensor *mij = reinterpret_cast<__gm__ Tensor *>(args[2]);
-    __gm__ Tensor *lij = reinterpret_cast<__gm__ Tensor *>(args[3]);
-    union {
-        uint64_t u;
-        float f;
-    } scale_conv;
-    scale_conv.u = static_cast<uint64_t>(args[4]);
-    float scale_value = scale_conv.f;
-    uint64_t q_tile_size = static_cast<uint64_t>(sij->shapes[0]);
-
-    if (q_tile_size == 16) {
-        softmax_prepare_impl<16, 128>(sij, scale_value, pij, mij, lij);
-    } else {
-        softmax_prepare_impl<64, 64>(sij, scale_value, pij, mij, lij);
-    }
-}
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
deleted file mode 100644
index b3314019a..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Paged Attention Orchestration Function - 16x16 Version
- *
- * Simplified for 16x16 framework-generated matmul kernels.
- * Each block processes a single 16x16 matmul operation.
- *
- * Memory Layout:
- *   Query: (batch, 16, 16) - one 16x16 tile per batch
- *   Key:   (total_blocks, 16, 16) - stored as K^T for direct matmul
- *   Value: (total_blocks, 16, 16) - direct format
- */
-
-#include <algorithm>
-#include <cinttypes>
-#include <cstdint>
-#include <cstring>
-
-#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
-
-#define FUNC_QK_MATMUL 0
-#define FUNC_SOFTMAX_PREPARE 1
-#define FUNC_PV_MATMUL 2
-#define FUNC_ONLINE_UPDATE 3
-constexpr uint64_t PLATFORM_PROF_SYS_CNT_FREQ = 50000000;  // 50 MHz
-
-inline double cycles_to_us(uint64_t cycles) {
-    return (static_cast<double>(cycles) / PLATFORM_PROF_SYS_CNT_FREQ) * 1000000.0;
-}
-
-inline uint64_t get_sys_cnt_aicpu() {
-    uint64_t ticks;
-    asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
-    return ticks;
-}
-
-#define CYCLE_COUNT_START() uint64_t _t0 = get_sys_cnt_aicpu(), _t1
-#define CYCLE_COUNT_LAP(acc)       \
-    do {                           \
-        _t1 = get_sys_cnt_aicpu(); \
-        acc += (_t1 - _t0);        \
-        _t0 = _t1;                 \
-    } while (0)
-
-extern "C" {
-
-__attribute__((visibility("default"))) PTO2OrchestrationConfig
-aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
-    (void)orch_args;  // NOLINT(readability/casting)
-    return PTO2OrchestrationConfig{
-        .expected_arg_count = 7,
-    };
-}
-
-__attribute__((visibility("default"))) void build_paged_attention_graph(const ChipStorageTaskArgs &orch_args) {
-    uint64_t prof_param_extract = 0;
-    uint64_t prof_ext_tensor = 0;
-    uint64_t prof_scope = 0;
-    uint64_t prof_make_tensor = 0;
-    uint64_t prof_tensor_view = 0;
-    uint64_t prof_param_setup = 0;
-    uint64_t prof_submit_task = 0;
-    int prof_submit_count = 0;
-    int prof_make_count = 0;
-    int prof_view_count = 0;
-
-    CYCLE_COUNT_START();
-
-    // Read dimensions from tensor metadata
-    uint64_t batch = orch_args.tensor(0).shapes[0];
-    uint64_t num_heads = orch_args.tensor(0).shapes[1];
-    uint64_t head_dim = orch_args.tensor(0).shapes[2];
-    DataType data_type = orch_args.tensor(0).dtype;
-
-    uint64_t block_size = orch_args.tensor(1).shapes[1];
-    uint64_t block_num = orch_args.tensor(3).shapes[1];
-
-    uint64_t scale_value = orch_args.scalar(0);
-
-    uint64_t q_head_num = num_heads;
-    uint64_t q_tile = std::min(num_heads, 128UL);
-    uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
-    CYCLE_COUNT_LAP(prof_param_extract);
-
-    LOG_ALWAYS(">>>>>> batch = %" PRIu64, batch);
-
-    // Reshape tensors for kernel consumption (2D flattened)
-    void *query_ptr = orch_args.tensor(0).data_as<void>();
-    void *kc_ptr = orch_args.tensor(1).data_as<void>();
-    void *vc_ptr = orch_args.tensor(2).data_as<void>();
-    void *out_ptr = orch_args.tensor(5).data_as<void>();
-
-    uint64_t total_blocks_count = orch_args.tensor(1).shapes[0];
-
-    uint32_t query_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
-    uint32_t key_cache_shapes[2] = {
-        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
-    };
-    uint32_t value_cache_shapes[2] = {
-        static_cast<uint32_t>(total_blocks_count * block_size), static_cast<uint32_t>(head_dim)
-    };
-    uint32_t out_shapes[2] = {static_cast<uint32_t>(batch * num_heads), static_cast<uint32_t>(head_dim)};
-    Tensor query = make_tensor_external(query_ptr, query_shapes, 2, data_type);
-    Tensor key_cache = make_tensor_external(kc_ptr, key_cache_shapes, 2, data_type);
-    Tensor value_cache = make_tensor_external(vc_ptr, value_cache_shapes, 2, data_type);
-    Tensor out = make_tensor_external(out_ptr, out_shapes, 2, DataType::FLOAT32);
-    CYCLE_COUNT_LAP(prof_ext_tensor);
-
-    uint32_t bt_shapes[2] = {static_cast<uint32_t>(batch), static_cast<uint32_t>(block_num)};
-    Tensor block_table =
-        make_tensor_external(orch_args.tensor(3).data_as<void>(), bt_shapes, 2, DataType::INT32, false);
-    uint32_t cl_shapes[1] = {static_cast<uint32_t>(batch)};
-    Tensor context_lens =
-        make_tensor_external(orch_args.tensor(4).data_as<void>(), cl_shapes, 1, DataType::INT32, false);
-
-    // Create infos are loop-invariant — shapes depend only on q_tile/head_dim/block_size
-    uint32_t tile2d_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(head_dim)};
-    uint32_t scalar_shapes[1] = {static_cast<uint32_t>(q_tile)};
-    uint32_t sij_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(block_size)};
-    TensorCreateInfo tile2d_ci(tile2d_shapes, 2, DataType::FLOAT32);
-    TensorCreateInfo scalar_ci(scalar_shapes, 1, DataType::FLOAT32);
-    TensorCreateInfo sij_ci(sij_shapes, 2, DataType::FLOAT32);
-    TensorCreateInfo pij_f16_ci(sij_shapes, 2, data_type);
-
-    prof_make_count += 4;
-    CYCLE_COUNT_LAP(prof_make_tensor);
-
-    int total_tasks = 0;
-
-    for (uint64_t b_idx = 0; b_idx < batch; b_idx++) {
-        uint32_t cl_idx[1] = {static_cast<uint32_t>(b_idx)};
-        uint64_t cur_seq = static_cast<uint64_t>(get_tensor_data<int32_t>(context_lens, 1, cl_idx));
-        uint64_t bn_this_batch = (cur_seq + block_size - 1) / block_size;
-        for (uint64_t q_idx = 0; q_idx < q_loop; q_idx++) {
-            PTO2_SCOPE() {
-                CYCLE_COUNT_LAP(prof_scope);
-                uint64_t cur_offset = b_idx * q_head_num + q_idx * q_tile;
-
-                uint32_t qi_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
-                Tensor qi = query.view(tile2d_shapes, qi_offsets);
-                uint32_t out_view_offsets[2] = {static_cast<uint32_t>(cur_offset), 0};
-                Tensor out_view = out.view(tile2d_shapes, out_view_offsets);
-                prof_view_count += 2;
-                CYCLE_COUNT_LAP(prof_tensor_view);
-
-                CYCLE_COUNT_LAP(prof_param_setup);
-                TaskOutputTensors alloc_outs = alloc_tensors(tile2d_ci, scalar_ci, scalar_ci);
-                const Tensor &oi = alloc_outs.get_ref(0);
-                const Tensor &li_update = alloc_outs.get_ref(1);
-                const Tensor &mi_update = alloc_outs.get_ref(2);
-                prof_submit_count++;
-                CYCLE_COUNT_LAP(prof_submit_task);
-
-                for (uint64_t bn = 0; bn < bn_this_batch; bn++) {
-                    PTO2_SCOPE_GUARD();
-
-                    uint32_t bt_idx[2] = {static_cast<uint32_t>(b_idx), static_cast<uint32_t>(bn)};
-                    uint64_t cur_block_idx = static_cast<uint64_t>(get_tensor_data<int32_t>(block_table, 2, bt_idx));
-                    uint64_t valid_len = std::min(block_size, cur_seq - bn * block_size);
-                    CYCLE_COUNT_LAP(prof_param_extract);
-
-                    uint32_t kv_shapes[2] = {static_cast<uint32_t>(block_size), static_cast<uint32_t>(head_dim)};
-                    uint32_t kv_offsets[2] = {static_cast<uint32_t>(cur_block_idx * block_size), 0};
-                    Tensor kj = key_cache.view(kv_shapes, kv_offsets);
-                    Tensor vj = value_cache.view(kv_shapes, kv_offsets);
-                    prof_view_count += 2;
-                    CYCLE_COUNT_LAP(prof_tensor_view);
-
-                    Arg params_qk;
-                    params_qk.add_input(qi);
-                    params_qk.add_input(kj);
-                    params_qk.add_output(sij_ci);
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    TaskOutputTensors qk_outs = pto2_rt_submit_aic_task(FUNC_QK_MATMUL, params_qk);
-                    const Tensor &sij = qk_outs.get_ref(0);
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-
-                    uint32_t sij_valid_shapes[2] = {static_cast<uint32_t>(q_tile), static_cast<uint32_t>(valid_len)};
-                    uint32_t sij_valid_offsets[2] = {0, 0};
-                    Tensor sij_valid = sij.view(sij_valid_shapes, sij_valid_offsets);
-                    prof_view_count += 1;
-                    CYCLE_COUNT_LAP(prof_tensor_view);
-
-                    Arg params_sf;
-                    params_sf.add_input(sij_valid);
-                    params_sf.add_output(pij_f16_ci);
-                    params_sf.add_output(scalar_ci);
-                    params_sf.add_output(scalar_ci);
-                    params_sf.add_scalar(scale_value);
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    TaskOutputTensors sf_outs = pto2_rt_submit_aiv_task(FUNC_SOFTMAX_PREPARE, params_sf);
-                    const Tensor &pij_f16 = sf_outs.get_ref(0);
-                    const Tensor &mi = sf_outs.get_ref(1);
-                    const Tensor &li = sf_outs.get_ref(2);
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-
-                    Arg params_pv;
-                    params_pv.add_input(pij_f16);
-                    params_pv.add_input(vj);
-                    params_pv.add_output(tile2d_ci);
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    TaskOutputTensors pv_outs = pto2_rt_submit_aic_task(FUNC_PV_MATMUL, params_pv);
-                    const Tensor &oi_tmp = pv_outs.get_ref(0);
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-
-                    uint64_t is_first = (bn == 0) ? 1 : 0;
-                    uint64_t is_last = (bn == bn_this_batch - 1) ? 1 : 0;
-                    CYCLE_COUNT_LAP(prof_param_extract);
-
-                    Arg params_up;
-                    params_up.add_input(mi);
-                    params_up.add_input(li);
-                    params_up.add_input(oi_tmp);
-                    params_up.add_inout(mi_update);
-                    params_up.add_inout(li_update);
-                    params_up.add_inout(oi);
-                    params_up.add_inout(out_view);
-                    params_up.add_scalar(is_first);
-                    params_up.add_scalar(is_last);
-                    CYCLE_COUNT_LAP(prof_param_setup);
-                    pto2_rt_submit_aiv_task(FUNC_ONLINE_UPDATE, params_up);
-                    prof_submit_count++;
-                    CYCLE_COUNT_LAP(prof_submit_task);
-                }
-            }
-            CYCLE_COUNT_LAP(prof_scope);
-        }
-    }
-
-    uint64_t total = prof_param_extract + prof_ext_tensor + prof_make_tensor + prof_tensor_view + prof_param_setup +
-                     prof_submit_task + prof_scope;
-    LOG_ALWAYS(
-        "=== PagedAttn Orch Profiling: %d submits, %d makes, %d views, total=%.3fus ===", prof_submit_count,
-        prof_make_count, prof_view_count, cycles_to_us(total)
-    );
-    if (total > 0) {
-        LOG_ALWAYS(
-            "  param_extract    : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_extract),
-            prof_param_extract * 100.0 / total
-        );
-        LOG_ALWAYS(
-            "  ext_tensor(x4)   : %7.3fus (%5.1f%%)", cycles_to_us(prof_ext_tensor), prof_ext_tensor * 100.0 / total
-        );
-        LOG_ALWAYS(
-            "  create_info(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_make_count, cycles_to_us(prof_make_tensor),
-            prof_make_tensor * 100.0 / total,
-            prof_make_count > 0 ? cycles_to_us(prof_make_tensor) / prof_make_count : 0.0
-        );
-        LOG_ALWAYS(
-            "  tensor_view(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_view_count, cycles_to_us(prof_tensor_view),
-            prof_tensor_view * 100.0 / total,
-            prof_view_count > 0 ? cycles_to_us(prof_tensor_view) / prof_view_count : 0.0
-        );
-        LOG_ALWAYS(
-            "  param_setup      : %7.3fus (%5.1f%%)", cycles_to_us(prof_param_setup), prof_param_setup * 100.0 / total
-        );
-        LOG_ALWAYS("  scope            : %7.3fus (%5.1f%%)", cycles_to_us(prof_scope), prof_scope * 100.0 / total);
-        LOG_ALWAYS(
-            "  submit_task(x%d) : %7.3fus (%5.1f%%)  avg=%.3fus", prof_submit_count, cycles_to_us(prof_submit_task),
-            prof_submit_task * 100.0 / total,
-            prof_submit_count > 0 ? cycles_to_us(prof_submit_task) / prof_submit_count : 0.0
-        );
-    }
-
-#undef CYCLE_COUNT_START
-#undef CYCLE_COUNT_LAP
-}
-
-}  // extern "C"
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
deleted file mode 100644
index 4e3a52890..000000000
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""Paged attention — tensormap_and_ringbuffer test (production scale, bfloat16).
-
-AIC+AIV mixed execution with online softmax paged attention.
-Production-scale cases for A5 hardware validation.
-"""
-
-import torch
-from simpler.task_interface import ArgDirection as D
-
-from simpler_setup import Scalar, SceneTestCase, TaskArgsBuilder, Tensor, scene_test
-from simpler_setup.goldens.paged_attention import compute_golden as _pa_compute_golden
-from simpler_setup.goldens.paged_attention import generate_inputs as _pa_generate_inputs
-
-
-@scene_test(level=2, runtime="tensormap_and_ringbuffer")
-class TestPagedAttention(SceneTestCase):
-    """Paged attention with tensormap_and_ringbuffer runtime on A5."""
-
-    RTOL = 1e-3
-    ATOL = 1e-3
-
-    CALLABLE = {
-        "orchestration": {
-            "source": "kernels/orchestration/paged_attention_orch.cpp",
-            "function_name": "build_paged_attention_graph",
-            "signature": [D.IN, D.IN, D.IN, D.IN, D.IN, D.OUT],
-        },
-        "incores": [
-            {
-                "func_id": 0,
-                "source": "kernels/aic/aic_qk_matmul.cpp",
-                "core_type": "aic",
-                "signature": [D.IN, D.IN, D.OUT],
-            },
-            {
-                "func_id": 2,
-                "source": "kernels/aic/aic_pv_matmul.cpp",
-                "core_type": "aic",
-                "signature": [D.IN, D.IN, D.OUT],
-            },
-            {
-                "func_id": 1,
-                "source": "kernels/aiv/aiv_softmax_prepare.cpp",
-                "core_type": "aiv",
-                "signature": [D.IN, D.OUT, D.OUT, D.OUT],
-            },
-            {
-                "func_id": 3,
-                "source": "kernels/aiv/aiv_online_update.cpp",
-                "core_type": "aiv",
-                "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
-            },
-        ],
-    }
-
-    CASES = [
-        {
-            "name": "Case1",
-            "platforms": ["a5"],
-            "config": {"aicpu_thread_num": 4, "block_dim": 24},
-            "params": {
-                "batch": 256,
-                "num_heads": 16,
-                "kv_head_num": 1,
-                "head_dim": 128,
-                "block_size": 128,
-                "context_len": 8192,
-                "max_model_len": 32768,
-                "dtype": "bfloat16",
-            },
-        },
-        {
-            "name": "Case2",
-            "platforms": ["a5"],
-            "config": {"aicpu_thread_num": 4, "block_dim": 24},
-            "manual": True,
-            "params": {
-                "batch": 64,
-                "num_heads": 64,
-                "kv_head_num": 1,
-                "head_dim": 128,
-                "block_size": 64,
-                "context_len": 8192,
-                "max_model_len": 32768,
-                "dtype": "bfloat16",
-            },
-        },
-        {
-            "name": "Case3",
-            "platforms": ["a5"],
-            "config": {"aicpu_thread_num": 4, "block_dim": 24},
-            "manual": True,
-            "params": {
-                "batch": 64,
-                "num_heads": 64,
-                "kv_head_num": 1,
-                "head_dim": 256,
-                "block_size": 64,
-                "context_len": 8192,
-                "max_model_len": 32768,
-                "dtype": "bfloat16",
-            },
-        },
-    ]
-
-    def generate_args(self, params):
-        inputs = _pa_generate_inputs(params)
-        specs = []
-        for name, val in inputs:
-            if isinstance(val, torch.Tensor):
-                specs.append(Tensor(name, val))
-            else:
-                specs.append(Scalar(name, val))
-        return TaskArgsBuilder(*specs)
-
-    def compute_golden(self, args, params):
-        tensors = {s.name: s.value for s in args.specs if isinstance(s, Tensor)}
-        _pa_compute_golden(tensors, params)
-        for s in args.specs:
-            if isinstance(s, Tensor) and s.name in tensors:
-                getattr(args, s.name)[:] = tensors[s.name]
-
-
-if __name__ == "__main__":
-    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aic/kernel_spmd_read.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/aiv/kernel_spmd_read.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_basic/kernels/orchestration/spmd_basic_orch.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/kernels/orchestration/spmd_multiblock_aiv_orch.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/kernels/orchestration/spmd_multiblock_mix_orch.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/kernels/orchestration/spmd_starvation_orch.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/kernels/orchestration/spmd_sync_start_orch.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/kernels/orchestration/spmd_sync_start_aiv_orch.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/kernels/orchestration/spmd_sync_start_edge_orch.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/kernels/orchestration/spmd_sync_start_stress_orch.cpp
diff --git a/examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
similarity index 100%
rename from examples/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
rename to tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py

From fb39511f90dbe201d9445ded2ea06d0fa45e1921 Mon Sep 17 00:00:00 2001
From: majin0824 <majin15@huawei.com>
Date: Thu, 16 Apr 2026 21:06:06 +0800
Subject: [PATCH 3/5] fix: Add the kernel names that were omitted during the
 migration process

- The earlier test-case migration left some kernel entries without a "name" field.

- This commit adds the missing names to the aic and aiv kernel entries in the test_*.py files, keeping the migrated tests complete and consistent.
---
 .../bgemm/test_bgemm.py                       |  2 ++
 .../paged_attention/test_paged_attention.py   |  4 +++
 .../bgemm/test_bgemm.py                       |  2 ++
 .../paged_attention/test_paged_attention.py   |  4 +++
 .../test_paged_attention_unroll.py            |  6 ++++
 .../paged_attention/test_paged_attention.py   |  4 +++
 .../test_batch_paged_attention.py             |  4 +++
 .../benchmark_bgemm/test_benchmark_bgemm.py   |  2 ++
 .../mixed_example/test_mixed_example.py       |  5 ++++
 .../test_multi_round_paged_attention.py       |  4 +++
 .../test_paged_attention_unroll.py            |  4 +++
 .../spmd_basic/test_spmd_basic.py             |  6 ++--
 .../test_spmd_multiblock_aiv.py               |  2 +-
 .../test_spmd_multiblock_mix.py               |  6 ++--
 .../spmd_starvation/test_spmd_starvation.py   | 21 ++++++++++++--
 .../spmd_sync_start/test_spmd_sync_start.py   | 21 ++++++++++++--
 .../test_spmd_sync_start_aiv.py               |  7 ++++-
 .../test_spmd_sync_start_edge.py              | 21 ++++++++++++--
 .../test_spmd_sync_start_stress.py            | 28 ++++++++++++++++---
 .../paged_attention/test_paged_attention.py   |  4 +++
 .../mixed_example/test_mixed_example.py       |  5 ++++
 .../test_paged_attention_unroll.py            |  4 +++
 .../spmd_basic/test_spmd_basic.py             |  6 ++--
 .../test_spmd_multiblock_aiv.py               |  2 +-
 .../test_spmd_multiblock_mix.py               |  6 ++--
 .../spmd_starvation/test_spmd_starvation.py   | 21 ++++++++++++--
 .../spmd_sync_start/test_spmd_sync_start.py   | 21 ++++++++++++--
 .../test_spmd_sync_start_aiv.py               |  7 ++++-
 .../test_spmd_sync_start_edge.py              | 21 ++++++++++++--
 .../test_spmd_sync_start_stress.py            | 28 ++++++++++++++++---
 30 files changed, 236 insertions(+), 42 deletions(-)

diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py b/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py
index f3e2d1c31..276f71175 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py
@@ -36,12 +36,14 @@ class TestBgemm(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "GEMM",
                 "source": "kernels/aic/kernel_gemm_tile.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "ADD",
                 "source": "kernels/aiv/kernel_tile_add.cpp",
                 "core_type": "aiv",
                 "signature": [D.INOUT, D.IN],
diff --git a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
index ee58ece6a..559de8522 100644
--- a/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
@@ -31,24 +31,28 @@ class TestPagedAttention(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "QK",
                 "source": "kernels/aic/aic_qk_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "SF",
                 "source": "kernels/aiv/aiv_softmax_prepare.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.OUT, D.OUT, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "PV",
                 "source": "kernels/aic/aic_pv_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "UP",
                 "source": "kernels/aiv/aiv_online_update.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
diff --git a/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py b/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py
index d7bc46a59..9601fcdf5 100644
--- a/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py
+++ b/examples/a5/tensormap_and_ringbuffer/bgemm/test_bgemm.py
@@ -37,12 +37,14 @@ class TestBgemm(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "GEMM",
                 "source": "kernels/mix/kernel_bgemm.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "ADD",
                 "source": "kernels/mix/kernel_bgemm.cpp",
                 "core_type": "aiv",
                 "signature": [D.INOUT, D.IN],
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
index a877c3ab2..3579a2d6a 100644
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/test_paged_attention.py
@@ -37,24 +37,28 @@ class TestPagedAttention(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "QK",
                 "source": "kernels/aic/aic_qk_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "PV",
                 "source": "kernels/aic/aic_pv_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "SF",
                 "source": "kernels/aiv/aiv_softmax_prepare.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.OUT, D.OUT, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "UP",
                 "source": "kernels/aiv/aiv_online_update.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
diff --git a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py
index c8d78acbe..d0b982df0 100644
--- a/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py
+++ b/tests/st/a2a3/aicpu_build_graph/paged_attention_unroll/test_paged_attention_unroll.py
@@ -37,36 +37,42 @@ class TestPagedAttentionUnrollAicpuBuildGraph(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "QK",
                 "source": "kernels/aic/aic_qk_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "PV",
                 "source": "kernels/aic/aic_pv_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 4,
+                "name": "AIC_HUB",
                 "source": "kernels/aic/aic_hub.cpp",
                 "core_type": "aic",
                 "signature": [],
             },
             {
                 "func_id": 1,
+                "name": "SF",
                 "source": "kernels/aiv/aiv_softmax_prepare.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.OUT, D.OUT, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "UP",
                 "source": "kernels/aiv/aiv_online_update.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
             },
             {
                 "func_id": 5,
+                "name": "AIV_HUB",
                 "source": "kernels/aiv/aiv_hub.cpp",
                 "core_type": "aiv",
                 "signature": [],
diff --git a/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py b/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py
index 13b5159b7..232b68b29 100644
--- a/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py
+++ b/tests/st/a2a3/host_build_graph/paged_attention/test_paged_attention.py
@@ -37,24 +37,28 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "QK",
                 "source": "kernels/aic/aic_qk_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "PV",
                 "source": "kernels/aic/aic_pv_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "SF",
                 "source": "kernels/aiv/aiv_softmax_prepare.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.OUT, D.OUT, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "UP",
                 "source": "kernels/aiv/aiv_online_update.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py
index cc1ed20e9..ecee598fd 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/batch_paged_attention/test_batch_paged_attention.py
@@ -31,24 +31,28 @@ class TestBatchPagedAttention(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "QK",
                 "source": "kernels/aic/aic_qk_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "SF",
                 "source": "kernels/aiv/aiv_softmax_prepare.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.OUT, D.OUT, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "PV",
                 "source": "kernels/aic/aic_pv_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "UP",
                 "source": "kernels/aiv/aiv_online_update.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py b/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py
index 514e2189a..05ea2d7a2 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py
@@ -29,12 +29,14 @@ class TestBenchmarkBgemm(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "GEMM",
                 "source": "kernels/aic/kernel_gemm_tile.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "ADD",
                 "source": "kernels/aiv/kernel_tile_add.cpp",
                 "core_type": "aiv",
                 "signature": [D.INOUT, D.IN],
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py b/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
index daf598969..da21e903c 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
@@ -51,30 +51,35 @@ class TestMixedExample(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "MATMUL",
                 "source": "kernels/aic/kernel_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "ADD",
                 "source": "kernels/aiv/kernel_add.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "MUL",
                 "source": "kernels/aiv/kernel_mul.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "ADD_STANDALONE",
                 "source": "kernels/aiv/kernel_add_standalone.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 4,
+                "name": "MUL_STANDALONE",
                 "source": "kernels/aiv/kernel_mul_standalone.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.OUT],
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/multi_round_paged_attention/test_multi_round_paged_attention.py b/tests/st/a2a3/tensormap_and_ringbuffer/multi_round_paged_attention/test_multi_round_paged_attention.py
index b9520e5af..a78b91de5 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/multi_round_paged_attention/test_multi_round_paged_attention.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/multi_round_paged_attention/test_multi_round_paged_attention.py
@@ -36,24 +36,28 @@ class TestMultiRoundPagedAttention(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "QK",
                 "source": f"{_PA_KERNELS}/aic/aic_qk_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "SF",
                 "source": f"{_PA_KERNELS}/aiv/aiv_softmax_prepare.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.OUT, D.OUT, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "PV",
                 "source": f"{_PA_KERNELS}/aic/aic_pv_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "UP",
                 "source": f"{_PA_KERNELS}/aiv/aiv_online_update.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py
index 847882d0a..f9dc66c54 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py
@@ -31,24 +31,28 @@ class TestPagedAttentionUnroll(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "QK",
                 "source": "kernels/aic/aic_qk_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "SF",
                 "source": "kernels/aiv/aiv_softmax_prepare.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.OUT, D.OUT, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "PV",
                 "source": "kernels/aic/aic_pv_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "UP",
                 "source": "kernels/aiv/aiv_online_update.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
index a35358e22..39ecfb73c 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
@@ -40,9 +40,9 @@ class TestSpmdBasic(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
+            {"func_id": 0, "name": "SPMD_READ_AIC", "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"},
+            {"func_id": 1, "name": "SPMD_READ_AIV0", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
+            {"func_id": 2, "name": "SPMD_READ_AIV1", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
         ],
     }
 
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
index f3d74a142..63b5f3ea8 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
@@ -35,7 +35,7 @@ class TestSpmdMultiblockAiv(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
+            {"func_id": 0, "name": "SPMD_WRITE_AIV", "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
         ],
     }
 
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
index edb931451..c0cc20cd6 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
@@ -36,9 +36,9 @@ class TestSpmdMultiblockMix(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 0, "name": "SPMD_MIX_AIC", "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
+            {"func_id": 1, "name": "SPMD_MIX_AIV0", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 2, "name": "SPMD_MIX_AIV1", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
         ],
     }
 
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
index d952c905f..1b3e5ff8a 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
@@ -57,9 +57,24 @@ class TestSpmdStarvation(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
index 1aa0758d9..f8bf33830 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
@@ -32,9 +32,24 @@ class TestSpmdSyncStart(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
index 3f9b0272b..e35b004a1 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
@@ -31,7 +31,12 @@ class TestSpmdSyncStartAiv(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_WRITE_AIV",
+                "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
index 550ac3211..5ebfb87b0 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
@@ -32,9 +32,24 @@ class TestSpmdSyncStartEdge(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
index a230c8264..c8e46a624 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
+++ b/tests/st/a2a3/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
@@ -57,10 +57,30 @@ class TestSpmdSyncStartStress(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 3, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 3,
+                "name": "SPMD_WRITE_AIV",
+                "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 
diff --git a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py
index 143092ce5..54d7afc39 100644
--- a/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py
+++ b/tests/st/a5/host_build_graph/paged_attention/test_paged_attention.py
@@ -37,24 +37,28 @@ class TestPagedAttentionHostBuildGraph(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "QK",
                 "source": "kernels/aic/aic_qk_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "PV",
                 "source": "kernels/aic/aic_pv_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "SF",
                 "source": "kernels/aiv/aiv_softmax_prepare.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.OUT, D.OUT, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "UP",
                 "source": "kernels/aiv/aiv_online_update.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
diff --git a/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
index 37a8a92ed..be7c792ee 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/mixed_example/test_mixed_example.py
@@ -57,30 +57,35 @@ class TestMixedExample(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "MATMUL",
                 "source": "kernels/aic/kernel_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "ADD",
                 "source": "kernels/aiv/kernel_add.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "MUL",
                 "source": "kernels/aiv/kernel_mul.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "ADD_STANDALONE",
                 "source": "kernels/aiv/kernel_add_standalone.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 4,
+                "name": "MUL_STANDALONE",
                 "source": "kernels/aiv/kernel_mul_standalone.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.OUT],
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py
index f79a98c0d..5421f9245 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/test_paged_attention_unroll.py
@@ -36,24 +36,28 @@ class TestPagedAttentionUnroll(SceneTestCase):
         "incores": [
             {
                 "func_id": 0,
+                "name": "QK",
                 "source": "kernels/aic/aic_qk_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 2,
+                "name": "PV",
                 "source": "kernels/aic/aic_pv_matmul.cpp",
                 "core_type": "aic",
                 "signature": [D.IN, D.IN, D.OUT],
             },
             {
                 "func_id": 1,
+                "name": "SF",
                 "source": "kernels/aiv/aiv_softmax_prepare.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.OUT, D.OUT, D.OUT],
             },
             {
                 "func_id": 3,
+                "name": "UP",
                 "source": "kernels/aiv/aiv_online_update.cpp",
                 "core_type": "aiv",
                 "signature": [D.IN, D.IN, D.IN, D.INOUT, D.INOUT, D.INOUT, D.INOUT],
diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
index 55d4cbfb7..e62ecee01 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_basic/test_spmd_basic.py
@@ -38,9 +38,9 @@ class TestSpmdBasic(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
+            {"func_id": 0, "name": "SPMD_READ_AIC", "source": "kernels/aic/kernel_spmd_read.cpp", "core_type": "aic"},
+            {"func_id": 1, "name": "SPMD_READ_AIV0", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
+            {"func_id": 2, "name": "SPMD_READ_AIV1", "source": "kernels/aiv/kernel_spmd_read.cpp", "core_type": "aiv"},
         ],
     }
 
diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
index 58becb0b8..254a37d55 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_aiv/test_spmd_multiblock_aiv.py
@@ -48,7 +48,7 @@ class TestSpmdMultiblockAiv(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
+            {"func_id": 0, "name": "SPMD_WRITE_AIV", "source": "kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
         ],
     }
 
diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
index 1bac22c74..0ef57c2c6 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_multiblock_mix/test_spmd_multiblock_mix.py
@@ -50,9 +50,9 @@ class TestSpmdMultiblockMix(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 0, "name": "SPMD_MIX_AIC", "source": "kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
+            {"func_id": 1, "name": "SPMD_MIX_AIV0", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {"func_id": 2, "name": "SPMD_MIX_AIV1", "source": "kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
         ],
     }
 
diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
index 425ccdab0..06d8a541c 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_starvation/test_spmd_starvation.py
@@ -69,9 +69,24 @@ class TestSpmdStarvation(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 
diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
index 18320397e..d4592cef7 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start/test_spmd_sync_start.py
@@ -48,9 +48,24 @@ class TestSpmdSyncStart(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 
diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
index 8a434caa5..7d0c2b314 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_aiv/test_spmd_sync_start_aiv.py
@@ -49,7 +49,12 @@ class TestSpmdSyncStartAiv(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_WRITE_AIV",
+                "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 
diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
index 11a728a02..35497a419 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_edge/test_spmd_sync_start_edge.py
@@ -53,9 +53,24 @@ class TestSpmdSyncStartEdge(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 
diff --git a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
index a87eb7209..4c7c9c789 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
+++ b/tests/st/a5/tensormap_and_ringbuffer/spmd_sync_start_stress/test_spmd_sync_start_stress.py
@@ -75,10 +75,30 @@ class TestSpmdSyncStartStress(SceneTestCase):
             "signature": [D.INOUT],
         },
         "incores": [
-            {"func_id": 0, "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp", "core_type": "aic"},
-            {"func_id": 1, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 2, "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp", "core_type": "aiv"},
-            {"func_id": 3, "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp", "core_type": "aiv"},
+            {
+                "func_id": 0,
+                "name": "SPMD_MIX_AIC",
+                "source": "../spmd_multiblock_mix/kernels/aic/kernel_spmd_mix.cpp",
+                "core_type": "aic",
+            },
+            {
+                "func_id": 1,
+                "name": "SPMD_MIX_AIV0",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 2,
+                "name": "SPMD_MIX_AIV1",
+                "source": "../spmd_multiblock_mix/kernels/aiv/kernel_spmd_mix.cpp",
+                "core_type": "aiv",
+            },
+            {
+                "func_id": 3,
+                "name": "SPMD_WRITE_AIV",
+                "source": "../spmd_multiblock_aiv/kernels/aiv/kernel_spmd_write.cpp",
+                "core_type": "aiv",
+            },
         ],
     }
 

From 949fcfe476192e1066bebb7e4d82542e8581b13f Mon Sep 17 00:00:00 2001
From: majin0824 <majin15@huawei.com>
Date: Fri, 17 Apr 2026 11:02:48 +0800
Subject: [PATCH 4/5] Refactor: merge bgemm into benchmark_bgemm and fix a5
 paged_attention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Delete examples/a2a3/bgemm (fixed-config), move benchmark_bgemm
  from tests/st to examples/a2a3 with a Bgemm64 case covering the
  old example config (tile=64, grid_k=4, block_dim=3)
- Add platform guards for aarch64 timer asm in a5 paged_attention
  orchestration files (mrs cntvct_el0 → rdtsc on x86_64)
---
 .github/workflows/ci.yml                      |   4 +-
 .../kernels/aic/kernel_gemm_tile.cpp          |   0
 .../kernels/aiv/kernel_tile_add.cpp           |   0
 .../kernels/orchestration/bgemm_orch.cpp      |   0
 .../benchmark_bgemm/test_benchmark_bgemm.py   |   6 +
 .../bgemm/kernels/aic/kernel_gemm_tile.cpp    | 121 -----------------
 .../bgemm/kernels/aiv/kernel_tile_add.cpp     |  74 -----------
 .../kernels/orchestration/bgemm_orch.cpp      | 124 ------------------
 .../bgemm/test_bgemm.py                       |  82 ------------
 .../kernels/aiv/aiv_softmax_prepare.cpp       |   1 -
 .../orchestration/paged_attention_orch.cpp    |  10 +-
 .../orchestration/paged_attention_orch.cpp    |  10 +-
 12 files changed, 26 insertions(+), 406 deletions(-)
 rename {tests/st => examples}/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp (100%)
 rename {tests/st => examples}/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp (100%)
 rename {tests/st => examples}/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp (100%)
 rename {tests/st => examples}/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py (94%)
 delete mode 100644 examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp
 delete mode 100644 examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp
 delete mode 100644 examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp
 delete mode 100644 examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 1a2d03e36..2869c3f93 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -267,7 +267,7 @@ jobs:
           pip install '.[test]'
 
       - name: Run simulation examples (a5sim)
-        run: python ci.py -p a5sim -c d96c8784 -t 600 --clone-protocol https
+        run: python ci.py -p a5sim -c 3cf259e8 -t 600 --clone-protocol https
 
       - name: Run pytest scene tests (a5sim)
         run: |
@@ -277,7 +277,7 @@ jobs:
           if [ $rc -eq 124 ]; then
             echo "pytest timed out; retrying with pinned PTO-ISA commit"
             pytest examples tests/st --platform a5sim --device 0-15 -v \
-              --pto-session-timeout 600 --pto-isa-commit d96c8784 --clone-protocol https
+              --pto-session-timeout 600 --pto-isa-commit 3cf259e8 --clone-protocol https
             rc=$?
           fi
           exit $rc
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp
rename to examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aic/kernel_gemm_tile.cpp
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp
rename to examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/aiv/kernel_tile_add.cpp
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
similarity index 100%
rename from tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
rename to examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/kernels/orchestration/bgemm_orch.cpp
diff --git a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py
similarity index 94%
rename from tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py
rename to examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py
index 05ea2d7a2..a3b888f75 100644
--- a/tests/st/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py
+++ b/examples/a2a3/tensormap_and_ringbuffer/benchmark_bgemm/test_benchmark_bgemm.py
@@ -79,6 +79,12 @@ class TestBenchmarkBgemm(SceneTestCase):
             "config": {"aicpu_thread_num": 4, "block_dim": 24},
             "params": {"matmul_add_task_num": 64, "incore_data_size": 128, "incore_loop": 4, "grid_k": 4},
         },
+        {
+            "name": "Bgemm64",
+            "platforms": ["a2a3sim", "a2a3"],
+            "config": {"aicpu_thread_num": 4, "block_dim": 3},
+            "params": {"matmul_add_task_num": 32, "incore_data_size": 64, "incore_loop": 1, "grid_k": 4},
+        },
     ]
 
     def generate_args(self, params):
diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp
deleted file mode 100644
index 56077fc90..000000000
--- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aic/kernel_gemm_tile.cpp
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Tile-based Matrix Multiplication Kernel (Cube Core)
- *
- * Computes: output = input_a @ input_b (64x64 tile matmul)
- * Uses TMATMUL instruction
- *
- * Args (Tensor*):
- *   args[0] = input_a (INPUT)
- *   args[1] = input_b (INPUT)
- *   args[2] = output  (OUTPUT)
- */
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-#include <pto/common/constants.hpp>
-#include <pto/common/pto_tile.hpp>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-template <typename T>
-AICORE constexpr inline T CeilAlign(T num_1, T num_2) {
-    if (num_2 == 0) {
-        return 0;
-    }
-    return (num_1 + num_2 - 1) / num_2 * num_2;
-}
-
-static __aicore__ void
-gemm_tile_impl(__gm__ Tensor *input_a_tensor, __gm__ Tensor *input_b_tensor, __gm__ Tensor *output_tensor) {
-    __gm__ float *input_a =
-        reinterpret_cast<__gm__ float *>(input_a_tensor->buffer.addr) + input_a_tensor->start_offset;
-    __gm__ float *input_b =
-        reinterpret_cast<__gm__ float *>(input_b_tensor->buffer.addr) + input_b_tensor->start_offset;
-    __gm__ float *output = reinterpret_cast<__gm__ float *>(output_tensor->buffer.addr) + output_tensor->start_offset;
-
-    constexpr int TILE = 64;
-    constexpr int blockAlign = C0_SIZE_BYTE / sizeof(float);
-    constexpr int M = CeilAlign<int>(TILE, 16);
-    constexpr int K = CeilAlign<int>(TILE, blockAlign);
-    constexpr int N = CeilAlign<int>(TILE, blockAlign);
-
-    using GlobalDataA =
-        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
-    using GlobalDataB =
-        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
-    using GlobalDataC =
-        GlobalTensor<float, Shape<1, 1, 1, TILE, TILE>, Stride<1 * TILE * TILE, 1 * TILE * TILE, TILE * TILE, TILE, 1>>;
-
-    GlobalDataA src0Global(input_a);
-    GlobalDataB src1Global(input_b);
-    GlobalDataC dstGlobal(output);
-
-    using TileMatA = Tile<TileType::Mat, float, M, K, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
-    using TileMatB = Tile<TileType::Mat, float, K, N, BLayout::ColMajor, TILE, TILE, SLayout::RowMajor, 512>;
-
-    using LeftTile = TileLeft<float, M, K, TILE, TILE>;
-    using RightTile = TileRight<float, K, N, TILE, TILE>;
-    using AccTile = TileAcc<float, M, N, TILE, TILE>;
-
-    TileMatA aMatTile;
-    TileMatB bMatTile;
-    TASSIGN(aMatTile, 0x0);
-    TASSIGN(bMatTile, 0x20000);
-
-    LeftTile aTile;
-    RightTile bTile;
-    AccTile cTile;
-    TASSIGN(aTile, 0x0);
-    TASSIGN(bTile, 0x0);
-    TASSIGN(cTile, 0x0);
-
-    TLOAD(aMatTile, src0Global);
-    TLOAD(bMatTile, src1Global);
-
-    set_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_MTE1, EVENT_ID0);
-
-    TMOV(aTile, aMatTile);
-    TMOV(bTile, bMatTile);
-
-    set_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-    wait_flag(PIPE_MTE1, PIPE_M, EVENT_ID0);
-
-    TMATMUL(cTile, aTile, bTile);
-
-    set_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-    wait_flag(PIPE_M, PIPE_FIX, EVENT_ID0);
-
-    TSTORE(dstGlobal, cTile);
-
-    set_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_FIX, PIPE_S, EVENT_ID7);
-}
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *input_a = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *input_b = reinterpret_cast<__gm__ Tensor *>(args[1]);
-    __gm__ Tensor *output = reinterpret_cast<__gm__ Tensor *>(args[2]);
-
-    gemm_tile_impl(input_a, input_b, output);
-}
diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp
deleted file mode 100644
index 2dce84dcd..000000000
--- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/aiv/kernel_tile_add.cpp
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * Tile-based Element-wise Addition Kernel (Vector Core) - INOUT Pattern
- *
- * Computes: C_tile = C_tile + P (64x64 tile accumulation)
- * Uses TADD instruction
- *
- * Args (Tensor*):
- *   args[0] = C_tile (INOUT: read + write accumulator)
- *   args[1] = P      (INPUT: matmul result to accumulate)
- */
-
-#include <cstdint>
-#include <pto/pto-inst.hpp>
-#include <pto/common/constants.hpp>
-
-#include "tensor.h"
-
-using namespace pto;
-
-#ifndef __gm__
-#define __gm__
-#endif
-
-#ifndef __aicore__
-#define __aicore__ [aicore]
-#endif
-
-extern "C" __aicore__ void kernel_entry(__gm__ int64_t *args) {
-    __gm__ Tensor *c_tensor = reinterpret_cast<__gm__ Tensor *>(args[0]);
-    __gm__ Tensor *p_tensor = reinterpret_cast<__gm__ Tensor *>(args[1]);
-
-    __gm__ float *c_ptr = reinterpret_cast<__gm__ float *>(c_tensor->buffer.addr) + c_tensor->start_offset;
-    __gm__ float *p_ptr = reinterpret_cast<__gm__ float *>(p_tensor->buffer.addr) + p_tensor->start_offset;
-
-    constexpr int TILE = 64;
-
-    using DynShapeDim5 = Shape<1, 1, 1, TILE, TILE>;
-    using DynStridDim5 = Stride<1, 1, 1, TILE, 1>;
-    using GlobalData = GlobalTensor<float, DynShapeDim5, DynStridDim5>;
-    using TileData = Tile<TileType::Vec, float, TILE, TILE, BLayout::RowMajor, -1, -1>;
-
-    TileData cTile(TILE, TILE);
-    TileData pTile(TILE, TILE);
-    TileData outTile(TILE, TILE);
-    TASSIGN(cTile, 0x0);
-    TASSIGN(pTile, 0x10000);
-    TASSIGN(outTile, 0x20000);
-
-    GlobalData cGlobal(c_ptr);
-    GlobalData pGlobal(p_ptr);
-    GlobalData outGlobal(c_ptr);  // write back to same C location
-
-    TLOAD(cTile, cGlobal);
-    TLOAD(pTile, pGlobal);
-    set_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    wait_flag(PIPE_MTE2, PIPE_V, EVENT_ID0);
-    TADD(outTile, cTile, pTile);
-    set_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID0);
-    TSTORE(outGlobal, outTile);
-
-    set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-    wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
-}
diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp b/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp
deleted file mode 100644
index 452e472fe..000000000
--- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/kernels/orchestration/bgemm_orch.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) PyPTO Contributors.
- * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
- * CANN Open Software License Agreement Version 2.0 (the "License").
- * Please refer to the License for details. You may not use this file except in compliance with the License.
- * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
- * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
- * See LICENSE in the root of the software repository for the full text of the License.
- * -----------------------------------------------------------------------------------------------------------
- */
-/**
- * BGEMM Orchestration Function (tensormap_and_ringbuffer Runtime)
- *
- * Builds the task graph for tiled matrix multiplication: C = A @ B
- *
- * Configuration:
- *   - Tile size: 64 x 64
- *   - Grid: 4 x 4 x 4 (GRID_M x GRID_K x GRID_N)
- *   - Batch: 2
- *
- * Memory layout (tile-first, 5D flattened):
- *   A: [BATCH, GRID_M, GRID_K, TILE, TILE]
- *   B: [BATCH, GRID_K, GRID_N, TILE, TILE]
- *   C: [BATCH, GRID_M, GRID_N, TILE, TILE]
- *
- * Task graph per output tile C[batch, m, n]:
- *   for k in [0, GRID_K):
- *     P = A[m,k] @ B[k,n]    (gemm_tile on Cube core, func_id=0)
- *     C[m,n] = C[m,n] + P    (tile_add on Vector core, func_id=1)
- *
- * Dependencies are automatic via TensorMap overlap detection.
- *
- * Arg layout: [A, B, C]  — shape/dtype/size in tensor metadata
- */
-
-#include <stddef.h>
-#include <stdint.h>
-
-#include "pto_orchestration_api.h"  // NOLINT(build/include_subdir)
-
-#define FUNC_GEMM_TILE 0
-#define FUNC_TILE_ADD 1
-
-// Grid and tile constants
-static constexpr int TILE = 64;
-static constexpr int GRID_M = 4;
-static constexpr int GRID_K = 4;
-static constexpr int GRID_N = 4;
-static constexpr int BATCH = 2;
-
-static constexpr uint32_t TILE_ELEMS = TILE * TILE;  // 4096 elements
-
-extern "C" {
-
-__attribute__((visibility("default"))) PTO2OrchestrationConfig
-aicpu_orchestration_config(const ChipStorageTaskArgs &orch_args) {
-    (void)orch_args;  // NOLINT(readability/casting)
-    return PTO2OrchestrationConfig{
-        .expected_arg_count = 3,
-    };
-}
-
-__attribute__((visibility("default"))) void aicpu_orchestration_entry(const ChipStorageTaskArgs &orch_args) {
-    // 1D external tensors for the full A, B, C arrays
-    Tensor ext_A = from_tensor_arg(orch_args.tensor(0));
-    Tensor ext_B = from_tensor_arg(orch_args.tensor(1));
-    Tensor ext_C = from_tensor_arg(orch_args.tensor(2));
-
-    LOG_INFO("[bgemm_orch] Grid: %dx%dx%d, Batch: %d, Tile: %d", GRID_M, GRID_K, GRID_N, BATCH, TILE);
-
-    uint32_t tile_shapes[1] = {TILE_ELEMS};
-    TensorCreateInfo tile_ci(tile_shapes, 1, DataType::FLOAT32);
-
-    for (int batch = 0; batch < BATCH; batch++) {
-        for (int m_idx = 0; m_idx < GRID_M; m_idx++) {
-            for (int n_idx = 0; n_idx < GRID_N; n_idx++) {
-                PTO2_SCOPE() {
-                    uint32_t c_elem_offset = (static_cast<uint32_t>(batch) * GRID_M * GRID_N +
-                                              static_cast<uint32_t>(m_idx) * GRID_N + static_cast<uint32_t>(n_idx)) *
-                                             TILE_ELEMS;
-                    uint32_t c_view_offsets[1] = {c_elem_offset};
-                    Tensor C_view = ext_C.view(tile_shapes, c_view_offsets);
-
-                    for (int k_idx = 0; k_idx < GRID_K; k_idx++) {
-                        uint32_t a_elem_offset =
-                            (static_cast<uint32_t>(batch) * GRID_M * GRID_K + static_cast<uint32_t>(m_idx) * GRID_K +
-                             static_cast<uint32_t>(k_idx)) *
-                            TILE_ELEMS;
-                        uint32_t b_elem_offset =
-                            (static_cast<uint32_t>(batch) * GRID_K * GRID_N + static_cast<uint32_t>(k_idx) * GRID_N +
-                             static_cast<uint32_t>(n_idx)) *
-                            TILE_ELEMS;
-
-                        uint32_t a_view_offsets[1] = {a_elem_offset};
-                        Tensor A_view = ext_A.view(tile_shapes, a_view_offsets);
-                        uint32_t b_view_offsets[1] = {b_elem_offset};
-                        Tensor B_view = ext_B.view(tile_shapes, b_view_offsets);
-                        // P = A[m,k] @ B[k,n]
-                        Arg params_gemm;
-                        params_gemm.add_input(A_view);
-                        params_gemm.add_input(B_view);
-                        params_gemm.add_output(tile_ci);
-                        TaskOutputTensors gemm_outs = pto2_rt_submit_aic_task(FUNC_GEMM_TILE,
-                                                                              params_gemm);  // gemm
-
-                        // C[m,n] += P
-                        Arg params_add;
-                        params_add.add_inout(C_view);
-                        params_add.add_input(gemm_outs.get_ref(0));
-                        pto2_rt_submit_aiv_task(FUNC_TILE_ADD,
-                                                params_add);  // add
-                    }
-                }
-            }
-        }
-    }
-
-    LOG_INFO(
-        "[bgemm_orch] Submitted tasks for %d batches, %dx%d output tiles, %d K steps each", BATCH, GRID_M, GRID_N,
-        GRID_K
-    );
-}
-
-}  // extern "C"
diff --git a/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py b/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py
deleted file mode 100644
index 276f71175..000000000
--- a/examples/a2a3/tensormap_and_ringbuffer/bgemm/test_bgemm.py
+++ /dev/null
@@ -1,82 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""BGEMM: batched tiled matrix multiplication C = A @ B.
-
-Fixed 4x4x4 grid with 64x64 tiles, 2 batches.
-"""
-
-import torch
-from simpler.task_interface import ArgDirection as D
-
-from simpler_setup import SceneTestCase, TaskArgsBuilder, Tensor, scene_test
-
-TILE_M, TILE_K, TILE_N = 64, 64, 64
-GRID_M, GRID_K, GRID_N = 4, 4, 4
-BATCH = 2
-
-
-@scene_test(level=2, runtime="tensormap_and_ringbuffer")
-class TestBgemm(SceneTestCase):
-    RTOL = 1e-3
-    ATOL = 1e-3
-
-    CALLABLE = {
-        "orchestration": {
-            "source": "kernels/orchestration/bgemm_orch.cpp",
-            "function_name": "aicpu_orchestration_entry",
-            "signature": [D.IN, D.IN, D.OUT],
-        },
-        "incores": [
-            {
-                "func_id": 0,
-                "name": "GEMM",
-                "source": "kernels/aic/kernel_gemm_tile.cpp",
-                "core_type": "aic",
-                "signature": [D.IN, D.IN, D.OUT],
-            },
-            {
-                "func_id": 1,
-                "name": "ADD",
-                "source": "kernels/aiv/kernel_tile_add.cpp",
-                "core_type": "aiv",
-                "signature": [D.INOUT, D.IN],
-            },
-        ],
-    }
-
-    CASES = [
-        {
-            "name": "default",
-            "platforms": ["a2a3sim", "a2a3"],
-            "config": {"aicpu_thread_num": 4, "block_dim": 3},
-            "params": {},
-        }
-    ]
-
-    def generate_args(self, params):
-        A = torch.randn(BATCH, GRID_M, GRID_K, TILE_M, TILE_K, dtype=torch.float32) * 0.01
-        B = torch.randn(BATCH, GRID_K, GRID_N, TILE_K, TILE_N, dtype=torch.float32) * 0.01
-        C = torch.zeros(BATCH, GRID_M, GRID_N, TILE_M, TILE_N, dtype=torch.float32)
-        return TaskArgsBuilder(Tensor("A", A.flatten()), Tensor("B", B.flatten()), Tensor("C", C.flatten()))
-
-    def compute_golden(self, args, params):
-        A = args.A.reshape(BATCH, GRID_M, GRID_K, TILE_M, TILE_K)
-        B = args.B.reshape(BATCH, GRID_K, GRID_N, TILE_K, TILE_N)
-        C = args.C.reshape(BATCH, GRID_M, GRID_N, TILE_M, TILE_N)
-        C[:] = 0.0
-        for batch in range(BATCH):
-            for m in range(GRID_M):
-                for n in range(GRID_N):
-                    for k in range(GRID_K):
-                        C[batch, m, n] += torch.matmul(A[batch, m, k], B[batch, k, n])
-
-
-if __name__ == "__main__":
-    SceneTestCase.run_module(__name__)
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
index 4bb21f68b..8fa605e68 100644
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/aiv/aiv_softmax_prepare.cpp
@@ -120,7 +120,6 @@ static __aicore__ void softmax_prepare_impl(
     TSTORE(mijGlobal, maxTile);
     wait_flag(PIPE_V, PIPE_MTE3, EVENT_ID1);
     TSTORE(lijGlobal, sumTile);
-    TSTORE(pijGlobal, pijF16Tile);
 
     set_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
     wait_flag(PIPE_MTE3, PIPE_S, EVENT_ID7);
diff --git a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
index b3314019a..c59abecf2 100644
--- a/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
+++ b/examples/a5/tensormap_and_ringbuffer/paged_attention/kernels/orchestration/paged_attention_orch.cpp
@@ -39,7 +39,15 @@ inline double cycles_to_us(uint64_t cycles) {
 
 inline uint64_t get_sys_cnt_aicpu() {
     uint64_t ticks;
+#if defined(__aarch64__)
     asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
+#elif defined(__x86_64__)
+    unsigned int lo, hi;
+    asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
+    ticks = (static_cast<uint64_t>(hi) << 32) | lo;
+#else
+    ticks = 0;
+#endif
     return ticks;
 }
 
@@ -87,7 +95,7 @@ __attribute__((visibility("default"))) void build_paged_attention_graph(const Ch
     uint64_t scale_value = orch_args.scalar(0);
 
     uint64_t q_head_num = num_heads;
-    uint64_t q_tile = std::min(num_heads, 128UL);
+    uint64_t q_tile = std::min(num_heads, static_cast<uint64_t>(128));
     uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
     CYCLE_COUNT_LAP(prof_param_extract);
 
diff --git a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
index 1460a588d..fba81681a 100644
--- a/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
+++ b/tests/st/a5/tensormap_and_ringbuffer/paged_attention_unroll/kernels/orchestration/paged_attention_orch.cpp
@@ -43,7 +43,15 @@ inline double cycles_to_us(uint64_t cycles) {
 
 inline uint64_t get_sys_cnt_aicpu() {
     uint64_t ticks;
+#if defined(__aarch64__)
     asm volatile("mrs %0, cntvct_el0" : "=r"(ticks));
+#elif defined(__x86_64__)
+    unsigned int lo, hi;
+    asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
+    ticks = (static_cast<uint64_t>(hi) << 32) | lo;
+#else
+    ticks = 0;
+#endif
     return ticks;
 }
 
@@ -105,7 +113,7 @@ __attribute__((visibility("default"))) void build_paged_attention_graph(const Ch
     // scale from scalar arg
     uint64_t scale_value = orch_args.scalar(0);
     uint64_t q_head_num = num_heads;
-    uint64_t q_tile = std::min(num_heads, 128UL);
+    uint64_t q_tile = std::min(num_heads, static_cast<uint64_t>(128));
     uint64_t q_loop = (q_head_num + q_tile - 1) / q_tile;
     CYCLE_COUNT_LAP(prof_param_extract);
 

From 76d9ed152fec132939d7a48a7caafb30dbac7117 Mon Sep 17 00:00:00 2001
From: majin0824 <majin15@huawei.com>
Date: Fri, 17 Apr 2026 18:15:59 +0800
Subject: [PATCH 5/5] Refactor: remove legacy run_example.py and ci.py runners

All golden.py-based tests have been migrated to pytest @scene_test
format, making both legacy runners dead code.

- Delete ci.py (repository root) and examples/scripts/run_example.py
- Remove ci.py steps from CI workflow; add --clone-protocol https to
  all first-attempt pytest calls so PTO-ISA clones via HTTPS in CI
- Update conftest.py to pre-clone PTO-ISA when --clone-protocol is
  non-default, replacing ci.py's pre-clone responsibility
- Update ci.sh run_task() to use test_*.py instead of run_example.py
- Remove run_example.py fallback from tools/benchmark_rounds.sh
- Remove ci.py and run_example.py smoke tests from verify_packaging.sh
---
 .github/workflows/ci.yml        |   25 +-
 ci.py                           | 1326 -------------------------------
 ci.sh                           |   13 +-
 conftest.py                     |    8 +-
 examples/scripts/run_example.py |  316 --------
 tools/benchmark_rounds.sh       |   13 +-
 tools/verify_packaging.sh       |    6 -
 7 files changed, 23 insertions(+), 1684 deletions(-)
 delete mode 100644 ci.py
 delete mode 100644 examples/scripts/run_example.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2869c3f93..6eadda84c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -207,13 +207,10 @@ jobs:
           pip install torch --index-url https://download.pytorch.org/whl/cpu
           pip install '.[test]'
 
-      - name: Run simulation examples (a2a3sim)
-        run: python ci.py -p a2a3sim -c d96c8784 -t 600 --clone-protocol https
-
       - name: Run pytest scene tests (a2a3sim)
         run: |
           set +e
-          pytest examples tests/st --platform a2a3sim --device 0-15 -v --pto-session-timeout 600
+          pytest examples tests/st --platform a2a3sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https
           rc=$?
           if [ $rc -eq 124 ]; then
             echo "pytest timed out; retrying with pinned PTO-ISA commit"
@@ -266,13 +263,10 @@ jobs:
           pip install torch --index-url https://download.pytorch.org/whl/cpu
           pip install '.[test]'
 
-      - name: Run simulation examples (a5sim)
-        run: python ci.py -p a5sim -c 3cf259e8 -t 600 --clone-protocol https
-
       - name: Run pytest scene tests (a5sim)
         run: |
           set +e
-          pytest examples tests/st --platform a5sim --device 0-15 -v --pto-session-timeout 600
+          pytest examples tests/st --platform a5sim --device 0-15 -v --pto-session-timeout 600 --clone-protocol https
           rc=$?
           if [ $rc -eq 124 ]; then
             echo "pytest timed out; retrying with pinned PTO-ISA commit"
@@ -319,17 +313,12 @@ jobs:
           pip install --upgrade pip
           pip install '.[test]'
 
-      - name: Run on-device examples (a2a3)
-        run: |
-          source .venv/bin/activate
-          source ${ASCEND_HOME_PATH}/bin/setenv.bash && python ci.py -p a2a3 -d ${DEVICE_RANGE} -c d96c8784 -t 600 --clone-protocol https
-
       - name: Run pytest scene tests (a2a3)
         run: |
           set +e
           source .venv/bin/activate
           source ${ASCEND_HOME_PATH}/bin/setenv.bash
-          python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v --pto-session-timeout 600
+          python -m pytest examples tests/st --platform a2a3 --device ${DEVICE_RANGE} -v --pto-session-timeout 600 --clone-protocol https
           rc=$?
           if [ $rc -eq 124 ]; then
             echo "pytest timed out; retrying with pinned PTO-ISA commit"
@@ -407,15 +396,9 @@ jobs:
           source ${ASCEND_HOME_PATH}/bin/setenv.bash
           pip install '.[test]'
 
-      - name: Run on-device examples (a5)
-        run: |
-          source ${ASCEND_HOME_PATH}/bin/setenv.bash
-          DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))")
-          task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "python ci.py -p a5 -d ${DEVICE_RANGE} -c d96c8784 -t 1200 --clone-protocol https"
-
       - name: Run pytest scene tests (a5)
         run: |
           source ${ASCEND_HOME_PATH}/bin/setenv.bash
           DEVICE_LIST=$(python -c "s,e='${DEVICE_RANGE}'.split('-'); print(','.join(str(i) for i in range(int(s),int(e)+1)))")
-          PYTEST="python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v"
+          PYTEST="python -m pytest examples tests/st --platform a5 --device ${DEVICE_RANGE} -v --clone-protocol https"
           task-submit --timeout 1800 --max-time 1800 --device "$DEVICE_LIST" --run "set +e; $PYTEST --pto-session-timeout 1200; rc=\$?; if [ \$rc -eq 124 ]; then echo 'pytest timed out; retrying with pinned PTO-ISA commit'; $PYTEST --pto-session-timeout 1200 --pto-isa-commit d96c8784 --clone-protocol https; rc=\$?; fi; exit \$rc"
diff --git a/ci.py b/ci.py
deleted file mode 100644
index 931920275..000000000
--- a/ci.py
+++ /dev/null
@@ -1,1326 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Batch CI test runner using ChipWorker for efficient device reuse.
-
-Replaces ci.sh by running all test tasks (sim + HW) in a single Python process
-per device, reusing ChipWorker across tasks that share the same runtime.
-
-Usage:
-    python ci.py                                                    # all sim platforms
-    python ci.py -p a2a3sim -r tensormap_and_ringbuffer -c 6622890  # single platform
-    python ci.py -p a2a3 -d 5-8 -c 6622890 -t 600                  # hardware with devices
-"""
-
-from __future__ import annotations
-
-import os
-import sys
-
-# ---------------------------------------------------------------------------
-# macOS libomp collision workaround — MUST run before any import that may
-# transitively load numpy or torch.  See docs/macos-libomp-collision.md for
-# the full analysis.
-#
-# On macOS with a --system-site-packages venv, homebrew's numpy pulls in
-# /opt/homebrew/opt/libomp/lib/libomp.dylib (via openblas), while pip's
-# torch ships its own .venv/.../torch/lib/libomp.dylib under a different
-# install name (/opt/llvm-openmp/lib/libomp.dylib).  Because the two
-# dylibs have distinct install names, dyld loads them both, and Intel's
-# libomp aborts the process with "OMP: Error #15 ... libomp already
-# initialized" (SIGABRT).
-#
-# The officially-documented escape hatch is KMP_DUPLICATE_LIB_OK=TRUE.
-# For our CI workload (numpy random + torch golden compute, no heavy
-# parallel OMP regions) the two runtimes never actually race, so allowing
-# the duplicate load is safe in practice.
-# ---------------------------------------------------------------------------
-if sys.platform == "darwin":
-    os.environ.setdefault("KMP_DUPLICATE_LIB_OK", "TRUE")
-
-import argparse
-import importlib.util
-import json
-import logging
-import signal
-import subprocess
-import tempfile
-import time
-from concurrent.futures import ThreadPoolExecutor
-from dataclasses import asdict, dataclass, field
-from pathlib import Path
-from queue import Empty, Queue
-from threading import Lock, Thread
-from typing import Any, Callable, Protocol, cast
-
-from simpler.task_interface import (  # type: ignore[import-not-found]
-    ChipCallable,  # pyright: ignore[reportAttributeAccessIssue]
-    ChipCallConfig,  # pyright: ignore[reportAttributeAccessIssue]
-    ChipStorageTaskArgs,  # pyright: ignore[reportAttributeAccessIssue]
-    ChipWorker,  # pyright: ignore[reportAttributeAccessIssue]
-    CoreCallable,  # pyright: ignore[reportAttributeAccessIssue]
-    make_tensor_arg,
-    scalar_to_uint64,
-)
-
-from simpler_setup.log_config import DEFAULT_LOG_LEVEL, LOG_LEVEL_CHOICES, configure_logging
-
-PROJECT_ROOT = Path(__file__).resolve().parent
-
-logger = logging.getLogger("ci")
-
-# ---------------------------------------------------------------------------
-# Data classes
-# ---------------------------------------------------------------------------
-
-EXAMPLES_DIR = PROJECT_ROOT / "examples"
-DEVICE_TESTS_DIR = PROJECT_ROOT / "tests" / "st"
-MAX_RETRIES = 3
-
-
-@dataclass
-class TaskSpec:
-    name: str
-    task_dir: Path
-    kernels_dir: Path
-    golden_path: Path
-    platform: str
-    runtime_name: str
-
-
-class BinaryArtifactPathLike(Protocol):
-    def read_bytes(self) -> bytes: ...
-
-    def __str__(self) -> str: ...
-
-
-class RuntimeBinariesLike(Protocol):
-    host_path: BinaryArtifactPathLike
-    aicpu_path: BinaryArtifactPathLike
-    aicore_path: BinaryArtifactPathLike
-    sim_context_path: Any
-
-
-class GoldenModuleLike(Protocol):
-    def generate_inputs(self, params: dict[str, Any]) -> object: ...
-
-    def compute_golden(self, tensors: dict[str, Any], params: dict[str, Any]) -> None: ...
-
-
-@dataclass
-class CompiledTask:
-    spec: TaskSpec
-    chip_callable: Any  # ChipCallable
-    cases: list[dict[str, Any]]
-    runtime_bins: Any
-    golden_module: Any
-    kernel_config: Any
-    rtol: float = 1e-5
-    atol: float = 1e-5
-    output_names: list[str] = field(default_factory=list)
-
-
-@dataclass
-class TaskResult:
-    name: str
-    platform: str
-    passed: bool
-    device: str
-    attempt: int
-    elapsed_s: float
-    error: str | None = None
-
-
-# ---------------------------------------------------------------------------
-# Module loading helpers (from code_runner.py)
-# ---------------------------------------------------------------------------
-
-
-def _load_module(path: Path, name: str):
-    spec = importlib.util.spec_from_file_location(name, path)
-    if spec is None or spec.loader is None:
-        raise ImportError(f"Cannot load module from {path}")
-    mod = importlib.util.module_from_spec(spec)
-    sys.modules[name] = mod
-    spec.loader.exec_module(mod)
-    return mod
-
-
-def _write_results_json(results: list[TaskResult], output_path: str | None) -> None:
-    if output_path is None:
-        return
-    Path(output_path).write_text(json.dumps([asdict(result) for result in results], indent=2) + "\n")
-
-
-def _read_results_json(result_path: Path) -> list[TaskResult]:
-    if not result_path.is_file():
-        return []
-    raw = result_path.read_text().strip()
-    if not raw:
-        return []
-    try:
-        payload = json.loads(raw)
-    except json.JSONDecodeError:
-        logger.warning("Ignoring invalid result JSON from %s", result_path)
-        return []
-    return [TaskResult(**item) for item in payload]
-
-
-def _write_task_list_json(tasks: list[TaskSpec], output_path: str | None) -> None:
-    if output_path is None:
-        return
-    Path(output_path).write_text(json.dumps([task.name for task in tasks], indent=2) + "\n")
-
-
-def _read_task_list_json(task_list_path: str | None) -> set[str] | None:
-    if task_list_path is None:
-        return None
-    path = Path(task_list_path)
-    if not path.is_file():
-        return None
-    return set(json.loads(path.read_text()))
-
-
-# ---------------------------------------------------------------------------
-# Task discovery
-# ---------------------------------------------------------------------------
-
-
-def _discover_runtimes_for_platform(platform: str) -> list[str]:
-    from simpler_setup.platform_info import discover_runtimes, parse_platform  # noqa: PLC0415
-
-    arch, _ = parse_platform(platform)
-    return discover_runtimes(arch)
-
-
-def discover_tasks(platform: str, runtime_filter: str | None = None) -> list[TaskSpec]:
-    """Scan examples/ and tests/st/ for test directories matching the given platform."""
-    from simpler_setup.platform_info import parse_platform  # noqa: PLC0415
-
-    arch, variant = parse_platform(platform)
-    is_sim = variant == "sim"
-    supported_runtimes = set(_discover_runtimes_for_platform(platform))
-
-    if runtime_filter:
-        if runtime_filter not in supported_runtimes:
-            raise ValueError(
-                f"Runtime '{runtime_filter}' not available for '{platform}'. Available: {sorted(supported_runtimes)}"
-            )
-        supported_runtimes = {runtime_filter}
-
-    tasks: list[TaskSpec] = []
-
-    search_dirs = [EXAMPLES_DIR]
-    if not is_sim:
-        search_dirs.append(DEVICE_TESTS_DIR)
-
-    for base_dir in search_dirs:
-        if not base_dir.is_dir():
-            continue
-        arch_dir = base_dir / arch
-        if not arch_dir.is_dir():
-            continue
-        for runtime_dir in sorted(arch_dir.iterdir()):
-            if not runtime_dir.is_dir():
-                continue
-            rt_name = runtime_dir.name
-            if rt_name not in supported_runtimes:
-                continue
-            for example_dir in sorted(runtime_dir.iterdir()):
-                if not example_dir.is_dir():
-                    continue
-                kernels_dir = example_dir / "kernels"
-                golden_path = example_dir / "golden.py"
-                kernel_config_path = kernels_dir / "kernel_config.py"
-                if not (kernel_config_path.is_file() and golden_path.is_file()):
-                    continue
-
-                rel = example_dir.relative_to(base_dir)
-                prefix = "device_test" if base_dir == DEVICE_TESTS_DIR else "example"
-                name = f"{prefix}:{rel}"
-
-                tasks.append(
-                    TaskSpec(
-                        name=name,
-                        task_dir=example_dir,
-                        kernels_dir=kernels_dir,
-                        golden_path=golden_path,
-                        platform=platform,
-                        runtime_name=rt_name,
-                    )
-                )
-
-    return tasks
-
-
-# ---------------------------------------------------------------------------
-# PTO-ISA management (reuses code_runner logic)
-# ---------------------------------------------------------------------------
-
-
-def ensure_pto_isa(commit: str | None, clone_protocol: str) -> str:
-    from simpler_setup.pto_isa import ensure_pto_isa_root  # noqa: PLC0415
-
-    # update_if_exists=True: when no commit is pinned, fetch latest origin/HEAD
-    # so CI runs reproducibly track main rather than whatever local checkout
-    # happens to be on disk.
-    return ensure_pto_isa_root(
-        commit=commit,
-        clone_protocol=clone_protocol,
-        update_if_exists=True,
-        verbose=True,
-    )
-
-
-# ---------------------------------------------------------------------------
-# Compilation
-# ---------------------------------------------------------------------------
-
-
-def compile_task(
-    spec: TaskSpec,
-    pto_isa_root: str,
-    build_runtime: bool = False,
-    run_all_cases: bool = False,
-) -> CompiledTask:
-    """Compile orchestration + kernels for a single task, return CompiledTask."""
-    from simpler_setup.elf_parser import extract_text_section  # noqa: PLC0415
-    from simpler_setup.kernel_compiler import KernelCompiler  # noqa: PLC0415
-    from simpler_setup.runtime_builder import RuntimeBuilder  # noqa: PLC0415
-
-    # Load kernel_config and golden
-    kc = _load_module(spec.kernels_dir / "kernel_config.py", f"kc_{id(spec)}")
-    golden = _load_module(spec.golden_path, f"golden_{id(spec)}")
-
-    kernels = kc.KERNELS
-    orchestration = kc.ORCHESTRATION
-
-    builder = RuntimeBuilder(platform=spec.platform)
-    compiler = KernelCompiler(platform=spec.platform)
-
-    # Resolve runtime include dirs
-    from simpler_setup.platform_info import parse_platform  # noqa: PLC0415
-
-    arch, _ = parse_platform(spec.platform)
-    runtime_base = PROJECT_ROOT / "src" / arch / "runtime" / spec.runtime_name
-    build_config_path = runtime_base / "build_config.py"
-    runtime_include_dirs = []
-    if build_config_path.is_file():
-        bc = _load_module(build_config_path, f"bc_{id(spec)}")
-        aicore_cfg = bc.BUILD_CONFIG.get("aicore", {})
-        for p in aicore_cfg.get("include_dirs", []):
-            runtime_include_dirs.append(str((runtime_base / p).resolve()))
-    else:
-        runtime_include_dirs.append(str(runtime_base / "runtime"))
-    runtime_include_dirs.append(str(PROJECT_ROOT / "src" / "common" / "task_interface"))
-
-    is_sim = spec.platform.endswith("sim")
-
-    # Compile runtime + orch + kernels in parallel
-    def _build_runtime():
-        return builder.get_binaries(spec.runtime_name, build=build_runtime)
-
-    def _compile_orch():
-        return compiler.compile_orchestration(spec.runtime_name, orchestration["source"])
-
-    def _compile_kernel(kernel):
-        incore_o = compiler.compile_incore(
-            kernel["source"],
-            core_type=kernel["core_type"],
-            pto_isa_root=pto_isa_root,
-            extra_include_dirs=runtime_include_dirs,
-        )
-        kernel_bin = incore_o if is_sim else extract_text_section(incore_o)
-        sig = kernel.get("signature", [])
-        return (kernel["func_id"], CoreCallable.build(signature=sig, binary=kernel_bin))
-
-    max_w = 2 + len(kernels)
-    with ThreadPoolExecutor(max_workers=max_w) as pool:
-        fut_rt = pool.submit(_build_runtime)
-        fut_orch = pool.submit(_compile_orch)
-        fut_kernels = [pool.submit(_compile_kernel, k) for k in kernels]
-
-        runtime_bins = fut_rt.result()
-        orch_binary = fut_orch.result()
-        kernel_binaries = [f.result() for f in fut_kernels]
-
-    orch_sig = orchestration.get("signature", [])
-    callable_obj = ChipCallable.build(
-        signature=orch_sig,
-        func_name=orchestration["function_name"],
-        binary=orch_binary,
-        children=kernel_binaries,
-        config_name=orchestration.get("config_name", ""),
-    )
-
-    all_cases = getattr(golden, "ALL_CASES", {"Default": {}})
-    if run_all_cases:
-        cases = [{"name": name, **params} for name, params in all_cases.items()]
-    else:
-        default_case = getattr(golden, "DEFAULT_CASE", "Default")
-        cases = [{"name": default_case, **all_cases[default_case]}]
-
-    return CompiledTask(
-        spec=spec,
-        chip_callable=callable_obj,
-        cases=cases,
-        runtime_bins=runtime_bins,
-        golden_module=golden,
-        kernel_config=kc,
-        rtol=getattr(golden, "RTOL", 1e-5),
-        atol=getattr(golden, "ATOL", 1e-5),
-        output_names=getattr(golden, "__outputs__", []),
-    )
-
-
-def compile_all_tasks(
-    tasks: list[TaskSpec],
-    pto_isa_root: str,
-    build_runtime: bool = False,
-    run_all_cases: bool = False,
-    max_workers: int = 4,
-) -> list[CompiledTask]:
-    """Compile all tasks in parallel. Returns list in same order as input."""
-    compiled: list[CompiledTask | None] = [None] * len(tasks)
-    errors: list[tuple[int, Exception]] = []
-    lock = Lock()
-
-    def _do(idx: int):
-        try:
-            result = compile_task(tasks[idx], pto_isa_root, build_runtime, run_all_cases)
-            with lock:
-                compiled[idx] = result
-        except Exception as e:
-            with lock:
-                errors.append((idx, e))
-
-    with ThreadPoolExecutor(max_workers=max_workers) as pool:
-        list(pool.map(_do, range(len(tasks))))
-
-    if errors:
-        for idx, e in errors:
-            logger.error(f"Failed to compile {tasks[idx].name}: {e}")
-        raise RuntimeError(f"{len(errors)} task(s) failed to compile")
-
-    return cast(list[CompiledTask], compiled)
-
-
-# ---------------------------------------------------------------------------
-# Single task execution
-# ---------------------------------------------------------------------------
-
-
-def run_single_task(
-    task: CompiledTask,
-    worker,
-    device_id: int,
-) -> bool:
-    """Run all cases in a compiled task on a given worker. Returns True if all pass."""
-    import ctypes  # noqa: PLC0415
-
-    import torch  # noqa: PLC0415
-
-    from simpler_setup.code_runner import _kernel_config_runtime_env, _temporary_env  # noqa: PLC0415
-
-    golden_mod = cast(GoldenModuleLike, task.golden_module)
-    kc = task.kernel_config
-    runtime_config = getattr(kc, "RUNTIME_CONFIG", {})
-
-    run_env = _kernel_config_runtime_env(kc, task.spec.kernels_dir)
-
-    for params in task.cases:
-        result = golden_mod.generate_inputs(params)
-
-        if isinstance(result, list):
-            # New-style: flat argument list
-            orch_args = ChipStorageTaskArgs()
-            args = {}
-            inputs = {}
-            outputs = {}
-            output_set = set(task.output_names)
-
-            for item in result:
-                name, value = item
-                if isinstance(value, torch.Tensor):
-                    tensor = value.cpu().contiguous()
-                    args[name] = tensor
-                    orch_args.add_tensor(make_tensor_arg(tensor))
-                    if name in output_set:
-                        outputs[name] = tensor
-                    else:
-                        inputs[name] = tensor
-                elif isinstance(value, ctypes._SimpleCData):
-                    orch_args.add_scalar(scalar_to_uint64(value))
-                    args[name] = value.value
-                else:
-                    raise TypeError(f"Unsupported arg type for '{name}': {type(value)}")
-        else:
-            raise TypeError("Legacy dict-style generate_inputs not supported in ci.py; use list-style")
-
-        # Compute golden
-        golden_outputs = {k: v.clone() for k, v in outputs.items()}
-        golden_with_inputs = {**inputs, **golden_outputs}
-        golden_mod.compute_golden(golden_with_inputs, params)
-
-        # Run on device
-        config = ChipCallConfig()
-        config.block_dim = runtime_config.get("block_dim", 24)
-        config.aicpu_thread_num = runtime_config.get("aicpu_thread_num", 3)
-
-        with _temporary_env(run_env):
-            worker.run(task.chip_callable, orch_args, config)
-
-        # Compare
-        for name, actual_tensor in outputs.items():
-            actual = actual_tensor.cpu()
-            expected = golden_outputs[name].cpu()
-            if not torch.allclose(actual, expected, rtol=task.rtol, atol=task.atol):
-                close_mask = torch.isclose(actual, expected, rtol=task.rtol, atol=task.atol)
-                mismatches = (~close_mask).sum().item()
-                total = actual.numel()
-                raise AssertionError(
-                    f"Output '{name}' mismatch in case '{params.get('name', '?')}': "
-                    f"{mismatches}/{total} elements differ (rtol={task.rtol}, atol={task.atol})"
-                )
-
-    return True
-
-
-# ---------------------------------------------------------------------------
-# Group tasks by runtime for ChipWorker reuse
-# ---------------------------------------------------------------------------
-
-
-def group_by_runtime(tasks: list[CompiledTask]) -> dict[str, list[CompiledTask]]:
-    groups: dict[str, list[CompiledTask]] = {}
-    for t in tasks:
-        groups.setdefault(t.spec.runtime_name, []).append(t)
-    return groups
-
-
-# ---------------------------------------------------------------------------
-# Device worker
-# ---------------------------------------------------------------------------
-
-
-def device_worker(
-    device_id: int,
-    task_queue: Queue,
-    results: list,
-    results_lock: Lock,
-    quarantined: set,
-    quarantine_lock: Lock,
-):
-    """Worker thread: pull tasks from queue, run them, handle retries."""
-    while True:
-        try:
-            item = task_queue.get_nowait()
-        except Empty:
-            break
-
-        runtime_name, compiled_tasks, attempt = item
-        rt_bins = cast(RuntimeBinariesLike, compiled_tasks[0].runtime_bins)
-
-        # Init worker for this runtime group
-        worker = ChipWorker()
-        try:
-            worker.init(
-                str(rt_bins.host_path),
-                str(rt_bins.aicpu_path),
-                str(rt_bins.aicore_path),
-                sim_context_lib_path=str(rt_bins.sim_context_path) if rt_bins.sim_context_path else "",
-            )
-            worker.set_device(device_id)
-        except Exception as e:
-            logger.error(f"[dev{device_id}] Failed to init ChipWorker for {runtime_name}: {e}")
-            for ct in compiled_tasks:
-                with results_lock:
-                    results.append(
-                        TaskResult(
-                            name=ct.spec.name,
-                            platform=ct.spec.platform,
-                            passed=False,
-                            device=str(device_id),
-                            attempt=attempt,
-                            elapsed_s=0,
-                            error=str(e),
-                        )
-                    )
-            with quarantine_lock:
-                quarantined.add(device_id)
-            task_queue.task_done()
-            break
-
-        failed_tasks = []
-        for ct in compiled_tasks:
-            start = time.monotonic()
-            logger.info(f"[dev{device_id}] Running: {ct.spec.name} (attempt {attempt})")
-            try:
-                run_single_task(ct, worker, device_id)
-                elapsed = time.monotonic() - start
-                logger.info(f"[dev{device_id}] PASS: {ct.spec.name} ({elapsed:.1f}s)")
-                with results_lock:
-                    results.append(
-                        TaskResult(
-                            name=ct.spec.name,
-                            platform=ct.spec.platform,
-                            passed=True,
-                            device=str(device_id),
-                            attempt=attempt,
-                            elapsed_s=elapsed,
-                        )
-                    )
-            except Exception as e:
-                elapsed = time.monotonic() - start
-                logger.error(f"[dev{device_id}] FAIL: {ct.spec.name} ({elapsed:.1f}s): {e}")
-                with results_lock:
-                    results.append(
-                        TaskResult(
-                            name=ct.spec.name,
-                            platform=ct.spec.platform,
-                            passed=False,
-                            device=str(device_id),
-                            attempt=attempt,
-                            elapsed_s=elapsed,
-                            error=str(e),
-                        )
-                    )
-                failed_tasks.append(ct)
-
-        worker.reset_device()
-        worker.finalize()
-
-        # Re-enqueue failed tasks for retry (individually, not as a group)
-        if failed_tasks and attempt + 1 < MAX_RETRIES:
-            for ct in failed_tasks:
-                task_queue.put((ct.spec.runtime_name, [ct], attempt + 1))
-        elif failed_tasks and attempt + 1 >= MAX_RETRIES:
-            logger.warning(f"[dev{device_id}] Quarantined after exhausting retries")
-            with quarantine_lock:
-                quarantined.add(device_id)
-            task_queue.task_done()
-            break
-
-        task_queue.task_done()
-
-
-# ---------------------------------------------------------------------------
-# Orchestrators: sim and HW
-# ---------------------------------------------------------------------------
-
-
-def run_hw_tasks(
-    compiled: list[CompiledTask],
-    devices: list[int],
-) -> list[TaskResult]:
-    """Run hardware tasks in-process with ChipWorker reuse per runtime group."""
-    groups = group_by_runtime(compiled)
-
-    task_queue: Queue = Queue()
-    for rt_name, tasks in groups.items():
-        task_queue.put((rt_name, tasks, 0))
-
-    results: list[TaskResult] = []
-    results_lock = Lock()
-    quarantined: set[int] = set()
-    quarantine_lock = Lock()
-
-    threads = []
-    for dev_id in devices:
-        t = Thread(
-            target=device_worker,
-            args=(dev_id, task_queue, results, results_lock, quarantined, quarantine_lock),
-        )
-        t.start()
-        threads.append(t)
-
-    for t in threads:
-        t.join()
-
-    if quarantined:
-        logger.warning("[hw] Quarantined devices: %s", sorted(quarantined))
-
-    return results
-
-
-def _build_device_worker_base_args(args: argparse.Namespace) -> list[str]:
-    base_args = [
-        sys.executable,
-        str(Path(__file__).resolve()),
-        "--device-worker",
-        "-p",
-        args.platform,
-        "--clone-protocol",
-        args.clone_protocol,
-    ]
-    if args.runtime:
-        base_args += ["-r", args.runtime]
-    if args.build_runtime:
-        base_args.append("--build-runtime")
-    if args.run_all_cases:
-        base_args.append("--all")
-    return base_args
-
-
-def _run_device_worker_subprocess(
-    tasks: list[TaskSpec],
-    device_id: int,
-    args: argparse.Namespace,
-    tag: str,
-    pto_isa_commit: str | None = None,
-    print_log_on_fail: bool = False,
-    quiet: bool = True,
-    timeout: int | None = None,
-) -> list[TaskResult]:
-    """Run a task batch in one device-worker subprocess and return its reported results.
-
-    When *quiet* is False, stdout streams to the terminal in real time
-    (useful for serial sim runs).  When True, output is captured and only
-    shown on failure if *print_log_on_fail* is set.
-    """
-    base_args = _build_device_worker_base_args(args)
-    if pto_isa_commit:
-        base_args += ["-c", pto_isa_commit]
-
-    with tempfile.NamedTemporaryFile(
-        prefix=f"ci_{tag}_tasks_dev{device_id}_",
-        suffix=".json",
-        delete=False,
-    ) as task_file:
-        task_list_path = Path(task_file.name)
-
-    with tempfile.NamedTemporaryFile(
-        prefix=f"ci_{tag}_dev{device_id}_",
-        suffix=".json",
-        delete=False,
-    ) as result_file:
-        result_path = Path(result_file.name)
-
-    _write_task_list_json(tasks, str(task_list_path))
-    full_cmd = base_args + [
-        "-d",
-        str(device_id),
-        "--task-list-json",
-        str(task_list_path),
-        "--result-json",
-        str(result_path),
-    ]
-
-    logger.info(f"[{tag}:dev{device_id}] Launching: {' '.join(full_cmd)}")
-    try:
-        if quiet:
-            proc = subprocess.run(full_cmd, check=False, capture_output=True, text=True, timeout=timeout)
-        else:
-            proc = subprocess.run(
-                full_cmd, check=False, stdout=None, stderr=subprocess.PIPE, text=True, timeout=timeout
-            )
-        device_results = _read_results_json(result_path)
-        if proc.returncode != 0:
-            if print_log_on_fail and quiet:
-                logger.error(f"[{tag}:dev{device_id}] Failed:\n{proc.stdout}\n{proc.stderr}")
-            elif print_log_on_fail and proc.stderr:
-                logger.error(f"[{tag}:dev{device_id}] stderr:\n{proc.stderr}")
-        # When the subprocess crashes without reporting per-task failures,
-        # generate FAIL results for every task that has no result yet so
-        # that pin-retry can match them by name.
-        if proc.returncode != 0 and not any(not r.passed for r in device_results):
-            reported_names = {r.name for r in device_results}
-            error_msg = (proc.stderr or proc.stdout or f"Device worker exited with code {proc.returncode}").strip()
-            for t in tasks:
-                if t.name not in reported_names:
-                    device_results.append(
-                        TaskResult(
-                            name=t.name,
-                            platform=t.platform,
-                            passed=False,
-                            device=str(device_id),
-                            attempt=0,
-                            elapsed_s=0,
-                            error=error_msg,
-                        )
-                    )
-        return device_results
-    except subprocess.TimeoutExpired:
-        logger.error(f"[{tag}:dev{device_id}] Subprocess timed out after {timeout}s")
-        device_results = _read_results_json(result_path)
-        reported_names = {r.name for r in device_results}
-        for t in tasks:
-            if t.name not in reported_names:
-                device_results.append(
-                    TaskResult(
-                        name=t.name,
-                        platform=t.platform,
-                        passed=False,
-                        device=str(device_id),
-                        attempt=0,
-                        elapsed_s=0,
-                        error=f"Timed out after {timeout}s",
-                    )
-                )
-        return device_results
-    finally:
-        task_list_path.unlink(missing_ok=True)
-        result_path.unlink(missing_ok=True)
-
-
-def _normalize_task_result(
-    task: TaskSpec,
-    device_id: int,
-    attempt: int,
-    task_results: list[TaskResult],
-) -> TaskResult:
-    matching = [result for result in task_results if result.name == task.name]
-    source = matching[-1] if matching else task_results[-1]
-    return TaskResult(
-        name=task.name,
-        platform=task.platform,
-        passed=source.passed,
-        device=str(device_id),
-        attempt=attempt,
-        elapsed_s=source.elapsed_s,
-        error=source.error,
-    )
-
-
-def run_hw_tasks_subprocess(
-    tasks: list[TaskSpec],
-    devices: list[int],
-    args: argparse.Namespace,
-    pto_isa_commit: str | None = None,
-) -> list[TaskResult]:
-    """Run hardware tasks: one subprocess per task.
-
-    On any failure the device is immediately quarantined (worker exits). Healthy
-    devices keep pulling from the shared queue. Tasks that were never run or failed
-    are collected so the caller can re-run them in a pin-commit pass with all devices
-    refreshed.
-    """
-    task_queue: Queue[tuple[TaskSpec, int]] = Queue()
-    total = len(tasks)
-    for task in tasks:
-        task_queue.put((task, 0))
-
-    results: list[TaskResult] = []
-    results_lock = Lock()
-    completed = [0]  # mutable counter for thread-safe increment
-    quarantined: set[int] = set()
-    quarantine_lock = Lock()
-    tag = "hw"
-
-    is_pin_retry = pto_isa_commit is not None
-
-    def _run_device(dev_id: int):
-        while True:
-            try:
-                task, attempt = task_queue.get_nowait()
-            except Empty:
-                return
-
-            is_last_attempt = attempt + 1 >= MAX_RETRIES
-            task_results = _run_device_worker_subprocess(
-                [task],
-                dev_id,
-                args,
-                tag=tag,
-                pto_isa_commit=pto_isa_commit,
-                print_log_on_fail=is_pin_retry and is_last_attempt,
-            )
-            normalized = _normalize_task_result(task, dev_id, attempt, task_results)
-            with results_lock:
-                results.append(normalized)
-                if normalized.passed or is_last_attempt:
-                    completed[0] += 1
-                n = completed[0]
-            status = "PASS" if normalized.passed else "FAIL"
-            attempt_info = f" attempt {attempt + 1}" if attempt > 0 else ""
-            logger.info(
-                f"[{tag}:dev{dev_id}] [{n}/{total}] {status}: {task.name}{attempt_info} ({normalized.elapsed_s:.1f}s)"
-            )
-
-            if normalized.passed:
-                continue
-
-            # Failure: re-enqueue with attempt+1 if under limit, quarantine this device
-            if not is_last_attempt:
-                task_queue.put((task, attempt + 1))
-            logger.warning(f"[{tag}:dev{dev_id}] Quarantined after failure on {task.name}")
-            with quarantine_lock:
-                quarantined.add(dev_id)
-            return
-
-    threads = [Thread(target=_run_device, args=(device_id,)) for device_id in devices]
-    for t in threads:
-        t.start()
-    for t in threads:
-        t.join()
-
-    # Tasks stranded in queue — all devices quarantined before queue emptied
-    while True:
-        try:
-            task, attempt = task_queue.get_nowait()
-        except Empty:
-            break
-        results.append(
-            TaskResult(
-                name=task.name,
-                platform=task.platform,
-                passed=False,
-                device="N/A",
-                attempt=attempt,
-                elapsed_s=0,
-                error="All devices quarantined",
-            )
-        )
-
-    if quarantined:
-        logger.warning(f"[{tag}] Quarantined devices: {sorted(quarantined)}")
-
-    return results
-
-
-# ---------------------------------------------------------------------------
-# Summary
-# ---------------------------------------------------------------------------
-
-
-def print_summary(results: list[TaskResult]) -> int:
-    """Print results table. Returns exit code (0 = all pass, 1 = failures)."""
-    # Deduplicate: keep last result per task name (retries produce multiple entries)
-    final: dict[str, TaskResult] = {}
-    for r in results:
-        final[r.name] = r
-
-    ordered = list(final.values())
-    pass_count = sum(1 for r in ordered if r.passed)
-    fail_count = sum(1 for r in ordered if not r.passed)
-    total = len(ordered)
-
-    is_tty = sys.stdout.isatty()
-    red = "\033[31m" if is_tty else ""
-    green = "\033[32m" if is_tty else ""
-    reset = "\033[0m" if is_tty else ""
-
-    # Column widths
-    name_w = max((len(r.name) for r in ordered), default=40)
-    name_w = max(40, min(72, name_w))
-
-    border = "=" * (name_w + 40)
-
-    # Print failure details first
-    for r in ordered:
-        if not r.passed and r.error:
-            print(f"\n--- FAIL: {r.name} (dev{r.device}, attempt {r.attempt + 1}) ---")
-            print(r.error)
-            print("--- END ---")
-
-    print(f"\n{border}")
-    print(f"{'CI RESULTS SUMMARY':^{len(border)}}")
-    print(border)
-    print(f"{'TASK':<{name_w}} {'PLATFORM':<10} {'DEVICE':<8} {'ATTEMPT':<8} {'TIME':<8} RESULT")
-    print(f"{'-' * name_w} {'-' * 10} {'-' * 8} {'-' * 8} {'-' * 8} ------")
-
-    for r in ordered:
-        name_display = r.name[: name_w - 3] + "..." if len(r.name) > name_w else r.name
-        status_str = f"{green}PASS{reset}" if r.passed else f"{red}FAIL{reset}"
-        print(
-            f"{name_display:<{name_w}} {r.platform:<10} {r.device:<8} "
-            f"{r.attempt + 1:<8} {r.elapsed_s:.0f}s{'':<5} {status_str}"
-        )
-
-    print(border)
-    print(f"Total: {total}  Passed: {pass_count}  Failed: {fail_count}")
-    print(border)
-
-    if fail_count == 0:
-        print("All tests passed!")
-        return 0
-    return 1
-
-
-# ---------------------------------------------------------------------------
-# PTO-ISA pin on failure (two-pass)
-# ---------------------------------------------------------------------------
-
-
-def reset_pto_isa(commit: str, clone_protocol: str) -> str:
-    """Checkout PTO-ISA at the pinned commit (or re-clone if needed)."""
-    from simpler_setup.pto_isa import checkout_pto_isa_commit, get_pto_isa_clone_path  # noqa: PLC0415
-
-    clone_path = get_pto_isa_clone_path()
-    if clone_path.exists():
-        checkout_pto_isa_commit(clone_path, commit, verbose=True)
-        return str(clone_path.resolve())
-    return ensure_pto_isa(commit, clone_protocol)
-
-
-# ---------------------------------------------------------------------------
-# Device-worker sub-command
-# ---------------------------------------------------------------------------
-
-
-def device_worker_main(args: argparse.Namespace) -> int:
-    """Entry point when invoked as --device-worker. Runs all tasks on one device."""
-    device_id = args.devices[0] if args.devices else 0
-    platform = args.platform
-
-    pto_isa_root = ensure_pto_isa(args.pto_isa_commit, args.clone_protocol)
-
-    tasks = discover_tasks(platform, runtime_filter=args.runtime)
-    selected_names = _read_task_list_json(args.task_list_json)
-    if selected_names is not None:
-        tasks = [task for task in tasks if task.name in selected_names]
-    if not tasks:
-        logger.info("No tasks found")
-        return 0
-
-    all_results = _run_tasks_on_device(tasks, device_id, platform, pto_isa_root, args)
-    _write_results_json(all_results, args.result_json)
-    return print_summary(all_results)
-
-
-def _run_tasks_on_device(
-    tasks: list[TaskSpec],
-    device_id: int,
-    platform: str,
-    pto_isa_root: str,
-    args: argparse.Namespace,
-) -> list[TaskResult]:
-    """Compile and run all tasks on a single device. Returns all TaskResults.
-
-    For simulation platforms with sufficient CPUs, tasks are distributed
-    across multiple virtual device IDs and executed in parallel threads.
-    ChipWorker.run() internally uses std::thread + join, so GIL is released
-    during execution, enabling true parallelism.
-    """
-    logger.info(f"Compiling {len(tasks)} tasks...")
-    try:
-        compiled = compile_all_tasks(
-            tasks, pto_isa_root, build_runtime=args.build_runtime, run_all_cases=args.run_all_cases
-        )
-    except RuntimeError:
-        return [
-            TaskResult(
-                name=t.name,
-                platform=platform,
-                passed=False,
-                device=str(device_id),
-                attempt=0,
-                elapsed_s=0,
-                error="compile failed",
-            )
-            for t in tasks
-        ]
-
-    is_sim = platform.endswith("sim")
-    if is_sim:
-        cpu_count = os.cpu_count() or 1
-        max_workers = min(max(cpu_count // 20, 1), len(compiled))
-    else:
-        max_workers = 1
-
-    if max_workers <= 1:
-        return _run_compiled_tasks(compiled, device_id, platform)
-
-    # Parallel: distribute tasks round-robin across virtual device IDs
-    buckets: list[list[CompiledTask]] = [[] for _ in range(max_workers)]
-    for i, ct in enumerate(compiled):
-        buckets[i % max_workers].append(ct)
-
-    logger.info(f"[sim] Parallel execution: {max_workers} workers, {len(compiled)} tasks")
-
-    results: list[TaskResult] = []
-    results_lock = Lock()
-    completed_count = [0]
-    total = len(compiled)
-
-    def _worker(worker_id: int, worker_tasks: list[CompiledTask]):
-        dev_id = worker_id
-        worker_results = _run_compiled_tasks(worker_tasks, dev_id, platform)
-        with results_lock:
-            for r in worker_results:
-                completed_count[0] += 1
-                n = completed_count[0]
-                results.append(r)
-                status = "PASS" if r.passed else "FAIL"
-                logger.info(f"[dev{dev_id}] [{n}/{total}] {status}: {r.name} ({r.elapsed_s:.1f}s)")
-
-    threads = []
-    for i in range(max_workers):
-        if not buckets[i]:
-            continue
-        t = Thread(target=_worker, args=(i, buckets[i]))
-        t.start()
-        threads.append(t)
-
-    for t in threads:
-        t.join()
-
-    return results
-
-
-def _run_compiled_tasks(
-    compiled: list[CompiledTask],
-    device_id: int,
-    platform: str,
-) -> list[TaskResult]:
-    """Run compiled tasks serially on a single device."""
-
-    groups = group_by_runtime(compiled)
-    all_results: list[TaskResult] = []
-
-    for rt_name, group_tasks in groups.items():
-        rt_bins = cast(RuntimeBinariesLike, group_tasks[0].runtime_bins)
-        worker = ChipWorker()
-        try:
-            worker.init(
-                str(rt_bins.host_path),
-                str(rt_bins.aicpu_path),
-                str(rt_bins.aicore_path),
-                sim_context_lib_path=str(rt_bins.sim_context_path) if rt_bins.sim_context_path else "",
-            )
-            worker.set_device(device_id)
-        except Exception as e:
-            logger.error(f"[dev{device_id}] Failed to init ChipWorker for {rt_name}: {e}")
-            all_results.extend(
-                TaskResult(
-                    name=ct.spec.name,
-                    platform=platform,
-                    passed=False,
-                    device=str(device_id),
-                    attempt=0,
-                    elapsed_s=0,
-                    error=str(e),
-                )
-                for ct in group_tasks
-            )
-            continue
-
-        for ct in group_tasks:
-            start = time.monotonic()
-            try:
-                run_single_task(ct, worker, device_id)
-                elapsed = time.monotonic() - start
-                logger.info(f"[dev{device_id}] PASS: {ct.spec.name} ({elapsed:.1f}s)")
-                all_results.append(
-                    TaskResult(
-                        name=ct.spec.name,
-                        platform=platform,
-                        passed=True,
-                        device=str(device_id),
-                        attempt=0,
-                        elapsed_s=elapsed,
-                    )
-                )
-            except Exception as e:
-                elapsed = time.monotonic() - start
-                logger.error(f"[dev{device_id}] FAIL: {ct.spec.name} ({elapsed:.1f}s): {e}")
-                all_results.append(
-                    TaskResult(
-                        name=ct.spec.name,
-                        platform=platform,
-                        passed=False,
-                        device=str(device_id),
-                        attempt=0,
-                        elapsed_s=elapsed,
-                        error=str(e),
-                    )
-                )
-
-        worker.reset_device()
-        worker.finalize()
-
-    return all_results
-
-
-# ---------------------------------------------------------------------------
-# Main
-# ---------------------------------------------------------------------------
-
-
-def _discover_valid_platforms() -> list[str]:
-    """Discover valid platforms from src/ directory structure (mirrors ci.sh logic)."""
-    platforms = []
-    src_dir = PROJECT_ROOT / "src"
-    if not src_dir.is_dir():
-        return platforms
-    for arch_dir in sorted(src_dir.iterdir()):
-        if not arch_dir.is_dir():
-            continue
-        arch = arch_dir.name
-        platform_dir = arch_dir / "platform"
-        if (platform_dir / "onboard").is_dir():
-            platforms.append(arch)
-        if (platform_dir / "sim").is_dir():
-            platforms.append(f"{arch}sim")
-    return platforms
-
-
-def parse_args() -> argparse.Namespace:
-    parser = argparse.ArgumentParser(description="Batch CI test runner with ChipWorker reuse")
-    parser.add_argument("-p", "--platform", default=None)
-    parser.add_argument("-d", "--device", dest="device_range", default="0")
-    parser.add_argument("-r", "--runtime", default=None)
-    parser.add_argument(
-        "--build-runtime",
-        action="store_true",
-        help="Rebuild runtime binaries from src/ instead of using pre-built build/lib artifacts",
-    )
-    parser.add_argument("-c", "--pto-isa-commit", default=None)
-    parser.add_argument("-t", "--timeout", type=int, default=600)
-    parser.add_argument("--clone-protocol", choices=["ssh", "https"], default="ssh")
-    parser.add_argument("--all", dest="run_all_cases", action="store_true", help="Run all cases, not just DEFAULT_CASE")
-    parser.add_argument(
-        "--log-level", choices=LOG_LEVEL_CHOICES, default=DEFAULT_LOG_LEVEL, help="Root logger level (default: info)"
-    )
-    parser.add_argument("--device-worker", action="store_true", help=argparse.SUPPRESS)
-    parser.add_argument("--result-json", default=None, help=argparse.SUPPRESS)
-    parser.add_argument("--task-list-json", default=None, help=argparse.SUPPRESS)
-    return parser.parse_args()
-
-
-def parse_device_range(device_range: str) -> list[int]:
-    if "-" in device_range:
-        start, end = device_range.split("-", 1)
-        return list(range(int(start), int(end) + 1))
-    return [int(device_range)]
-
-
-def _run_with_timeout(
-    phase_name: str,
-    timeout_s: int,
-    runner: Callable[[], list[TaskResult]],
-) -> list[TaskResult]:
-    def _watchdog_handler(signum, frame):
-        print(f"\n{'=' * 40}", flush=True)
-        print(
-            f"[CI] TIMEOUT: {phase_name} exceeded {timeout_s}s ({timeout_s // 60}min) limit, aborting",
-            flush=True,
-        )
-        print(f"{'=' * 40}", flush=True)
-        os._exit(1)
-
-    previous_handler = signal.getsignal(signal.SIGALRM)
-    signal.signal(signal.SIGALRM, _watchdog_handler)
-    signal.alarm(timeout_s)
-    try:
-        return runner()
-    finally:
-        signal.alarm(0)
-        signal.signal(signal.SIGALRM, previous_handler)
-
-
-def _run_single_platform(platform: str, args: argparse.Namespace) -> list[TaskResult]:
-    """Run all tasks for a single platform. Returns list of TaskResults."""
-    is_sim = platform.endswith("sim")
-
-    # Ensure PTO-ISA is available before task discovery so that downstream
-    # pytest scene tests (which share the same clone path) can find it even
-    # when ci.py itself has no tasks to run.
-    ensure_pto_isa(args.pto_isa_commit, args.clone_protocol)
-
-    tasks = discover_tasks(platform, runtime_filter=args.runtime)
-    if not tasks:
-        logger.info(f"[{platform}] No tasks found")
-        return []
-    logger.info(f"[{platform}] Discovered {len(tasks)} tasks")
-
-    # Compile and run via subprocess isolation.
-    # Sim: single subprocess with all tasks (ChipWorker reuse + parallel within).
-    # HW: one subprocess per task with device-level quarantine.
-    sub_args = argparse.Namespace(**vars(args))
-    sub_args.platform = platform
-    if is_sim:
-        all_results = _run_device_worker_subprocess(tasks, 0, sub_args, tag="sim", timeout=args.timeout, quiet=False)
-    else:
-        all_results = _run_with_timeout(
-            f"{platform} initial pass",
-            args.timeout,
-            lambda: run_hw_tasks_subprocess(tasks, args.devices, sub_args),
-        )
-
-    # Pin retry — re-run failed tasks with pinned PTO-ISA commit.
-    final: dict[str, TaskResult] = {}
-    for r in all_results:
-        final[r.name] = r
-    failures = [r for r in final.values() if not r.passed]
-
-    if failures and args.pto_isa_commit:
-        failed_names = {r.name for r in failures}
-        failed_tasks = [t for t in tasks if t.name in failed_names]
-        logger.info(f"[{platform}] {len(failed_tasks)} failure(s), retrying with pinned PTO-ISA {args.pto_isa_commit}")
-        if is_sim:
-            pin_results = _run_device_worker_subprocess(
-                failed_tasks,
-                0,
-                sub_args,
-                tag="sim",
-                pto_isa_commit=args.pto_isa_commit,
-                print_log_on_fail=True,
-                quiet=False,
-                timeout=args.timeout,
-            )
-        else:
-            pin_results = _run_with_timeout(
-                f"{platform} pin retry",
-                args.timeout,
-                lambda: run_hw_tasks_subprocess(
-                    failed_tasks,
-                    args.devices,
-                    sub_args,
-                    pto_isa_commit=args.pto_isa_commit,
-                ),
-            )
-        all_results.extend(pin_results)
-
-    return all_results
-
-
-def main() -> int:
-    args = parse_args()
-    configure_logging(args.log_level)
-    args.devices = parse_device_range(args.device_range)
-
-    valid_platforms = _discover_valid_platforms()
-
-    # Device-worker sub-command (always needs explicit -p)
-    if args.device_worker:
-        if not args.platform:
-            print("--device-worker requires -p/--platform")
-            return 1
-        return device_worker_main(args)
-
-    # Determine which platforms to run
-    if args.platform:
-        if args.platform not in valid_platforms:
-            print(f"Unknown platform: {args.platform}")
-            print(f"Valid platforms: {' '.join(valid_platforms)}")
-            return 1
-        platforms = [args.platform]
-    else:
-        # No -p: run all sim platforms
-        platforms = [p for p in valid_platforms if p.endswith("sim")]
-        if not platforms:
-            print("No sim platforms found")
-            return 1
-        logger.info(f"No platform specified, running all sim platforms: {', '.join(platforms)}")
-
-    all_results: list[TaskResult] = []
-    for platform in platforms:
-        all_results.extend(_run_single_platform(platform, args))
-
-    if not all_results:
-        logger.info("No tasks found")
-        return 0
-
-    return print_summary(all_results)
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/ci.sh b/ci.sh
index b46377c53..c15ef0c84 100755
--- a/ci.sh
+++ b/ci.sh
@@ -323,9 +323,16 @@ run_task() {
     local start_time=$SECONDS
 
     local -a cmd
-    cmd=(env PYTHONDONTWRITEBYTECODE=1 python examples/scripts/run_example.py
-        -k "${dir}/kernels" -g "${dir}/golden.py"
-        -p "$platform" --clone-protocol "$CLONE_PROTOCOL" "${commit_flag[@]}")
+    # Run the directory's test_*.py entry point; with no fallback runner, a missing test file returns 1
+    local test_file
+    test_file=$(find "$dir" -maxdepth 1 -name 'test_*.py' -print -quit 2>/dev/null || true)
+    if [[ -n "$test_file" ]]; then
+        cmd=(env PYTHONDONTWRITEBYTECODE=1 python "$test_file"
+            -p "$platform" --clone-protocol "$CLONE_PROTOCOL" "${commit_flag[@]}")
+    else
+        echo "[${platform}] SKIP: no test_*.py found in $dir"
+        return 1
+    fi
     [[ -n "$device_id" ]] && cmd+=(-d "$device_id")
 
     # Progress to stdout (not captured in log)
diff --git a/conftest.py b/conftest.py
index 97e43d534..4ff21bfc4 100644
--- a/conftest.py
+++ b/conftest.py
@@ -146,13 +146,17 @@ def pytest_configure(config):
         os.environ["PTO_LOG_LEVEL"] = log_level
 
     commit = config.getoption("--pto-isa-commit")
-    if commit:
+    clone_protocol = config.getoption("--clone-protocol")
+    # Pre-clone PTO-ISA when a commit is pinned or a non-ssh protocol is
+    # requested, so that --clone-protocol is respected (CI needs https, but
+    # scene_test.py defaults to ssh).  Previously ci.py handled this; now conftest owns it.
+    if commit or clone_protocol != "ssh":
         from simpler_setup.pto_isa import ensure_pto_isa_root  # noqa: PLC0415
 
         root = ensure_pto_isa_root(
             verbose=True,
             commit=commit,
-            clone_protocol=config.getoption("--clone-protocol"),
+            clone_protocol=clone_protocol,
         )
         if root:
             os.environ["PTO_ISA_ROOT"] = root
diff --git a/examples/scripts/run_example.py b/examples/scripts/run_example.py
deleted file mode 100644
index e9eb24bdb..000000000
--- a/examples/scripts/run_example.py
+++ /dev/null
@@ -1,316 +0,0 @@
-#!/usr/bin/env python3
-# Copyright (c) PyPTO Contributors.
-# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
-# CANN Open Software License Agreement Version 2.0 (the "License").
-# Please refer to the License for details. You may not use this file except in compliance with the License.
-# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
-# See LICENSE in the root of the software repository for the full text of the License.
-# -----------------------------------------------------------------------------------------------------------
-"""
-Simplified test runner for PTO runtime tests.
-
-This script provides a command-line interface to run PTO runtime tests
-with minimal configuration. Users only need to provide:
-1. A kernels directory with kernel_config.py
-2. A golden.py script
-
-Usage:
-    python examples/scripts/run_example.py --kernels ./my_test/kernels --golden ./my_test/golden.py
-    python examples/scripts/run_example.py -k ./kernels -g ./golden.py --device 0 --platform a2a3sim
-
-Examples:
-    # Run hardware example (requires Ascend device)
-    python examples/scripts/run_example.py -k examples/host_build_graph/vector_example/kernels \
-                                      -g examples/host_build_graph/vector_example/golden.py
-
-    # Run simulation example (no hardware required)
-    python examples/scripts/run_example.py -k examples/host_build_graph/vector_example/kernels \
-                                      -g examples/host_build_graph/vector_example/golden.py \
-                                      -p a2a3sim
-
-    # Run with specific device
-    python examples/scripts/run_example.py -k ./kernels -g ./golden.py -d 0
-"""
-
-import argparse
-import logging
-import os
-import sys
-import time
-from pathlib import Path
-
-from simpler_setup.code_runner import create_code_runner
-from simpler_setup.log_config import DEFAULT_LOG_LEVEL, LOG_LEVEL_CHOICES, configure_logging
-
-project_root = Path(__file__).parent.parent.parent
-
-logger = logging.getLogger(__name__)
-
-
-def _get_device_log_dir(device_id):
-    """Return the device log directory using the same logic as device_log_resolver."""
-    ascend_work_path = os.environ.get("ASCEND_WORK_PATH")
-    if ascend_work_path:
-        root = Path(ascend_work_path).expanduser() / "log" / "debug"
-        if root.exists():
-            return root / f"device-{device_id}"
-    return Path.home() / "ascend" / "log" / "debug" / f"device-{device_id}"
-
-
-def _wait_for_new_device_log(log_dir, pre_run_logs, timeout=15, interval=0.5):
-    """Wait for a new device log file that wasn't present before the run.
-
-    CANN dlog writes device logs asynchronously, so the file may appear
-    a few seconds after the run completes.
-    """
-    deadline = time.monotonic() + timeout
-    while time.monotonic() < deadline:
-        if log_dir.exists():
-            current_logs = set(log_dir.glob("*.log"))
-            new_logs = current_logs - pre_run_logs
-            if new_logs:
-                return max(new_logs, key=lambda p: p.stat().st_mtime)
-        time.sleep(interval)
-    return None
-
-
-def main():  # noqa: PLR0912
-    import warnings  # noqa: PLC0415
-
-    warnings.warn(
-        "run_example.py is deprecated. Use 'python test_*.py' with the same CLI options instead. "
-        "See docs/testing.md for details.",
-        DeprecationWarning,
-        stacklevel=1,
-    )
-
-    parser = argparse.ArgumentParser(
-        description="Run PTO runtime test with kernel config and golden script",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog="""
-Examples:
-    python examples/scripts/run_example.py --kernels ./my_test/kernels --golden ./my_test/golden.py
-    python examples/scripts/run_example.py -k ./kernels -g ./golden.py -d 0
-
-Golden.py interface:
-    def generate_inputs(params: dict) -> dict:
-        '''Return dict of torch tensors (inputs + outputs)'''
-        return {"a": torch.tensor(...), "out_f": torch.zeros(...)}
-
-    def compute_golden(tensors: dict, params: dict) -> None:
-        '''Compute expected outputs in-place'''
-        tensors["out_f"][:] = tensors["a"] + 1
-
-    # Optional — for parameterized test cases:
-    ALL_CASES = {"Case1": {"size": 1024}, "Case2": {"size": 2048}}
-    DEFAULT_CASE = "Case1"
-    RTOL = 1e-5  # Relative tolerance
-    ATOL = 1e-5  # Absolute tolerance
-    __outputs__ = ["out_f"]  # Or use 'out_' prefix
-        """,
-    )
-
-    parser.add_argument(
-        "-k",
-        "--kernels",
-        required=True,
-        help="Path to kernels directory containing kernel_config.py",
-    )
-
-    parser.add_argument("-g", "--golden", required=True, help="Path to golden.py script")
-
-    parser.add_argument("-d", "--device", type=int, default=0, help="Device ID (default: 0)")
-
-    parser.add_argument(
-        "-p",
-        "--platform",
-        default="a2a3",
-        choices=["a2a3", "a2a3sim", "a5", "a5sim"],
-        help="Platform name: 'a2a3'/'a5' for hardware, 'a2a3sim'/'a5sim' for simulation (default: a2a3)",
-    )
-
-    parser.add_argument(
-        "--log-level",
-        choices=LOG_LEVEL_CHOICES,
-        default=DEFAULT_LOG_LEVEL,
-        help=f"Root logger level (default: {DEFAULT_LOG_LEVEL})",
-    )
-
-    parser.add_argument(
-        "--enable-profiling",
-        action="store_true",
-        help="Enable profiling and generate swimlane.json",
-    )
-
-    parser.add_argument(
-        "--dump-tensor",
-        action="store_true",
-        help="Dump per-task tensor I/O at runtime (controlled by enable_dump_tensor flag)",
-    )
-
-    parser.add_argument(
-        "--all",
-        action="store_true",
-        help="Run all test cases defined in ALL_CASES (default: run only DEFAULT_CASE)",
-    )
-
-    parser.add_argument(
-        "--case",
-        type=str,
-        default=None,
-        help="Run a specific test case by name (e.g., --case Case2)",
-    )
-
-    parser.add_argument(
-        "-c",
-        "--pto-isa-commit",
-        type=str,
-        default=None,
-        help="Checkout PTO-ISA at this commit (e.g., -c 1b22fea)",
-    )
-
-    parser.add_argument(
-        "--rounds",
-        type=int,
-        default=None,
-        metavar="ROUNDS",
-        help="Number of rounds to run per case (overrides kernel_config RUNTIME_CONFIG['rounds'])",
-    )
-
-    parser.add_argument(
-        "--clone-protocol",
-        choices=["ssh", "https"],
-        default="ssh",
-        help="Git protocol for cloning pto-isa (default: ssh)",
-    )
-
-    parser.add_argument(
-        "--skip-golden",
-        action="store_true",
-        help="Skip golden computation and comparison (for benchmarking)",
-    )
-
-    parser.add_argument(
-        "--build",
-        action="store_true",
-        help="Compile runtime from source instead of using pre-built binaries",
-    )
-
-    args = parser.parse_args()
-
-    if args.all and args.case:
-        parser.error("--all and --case are mutually exclusive")
-
-    configure_logging(args.log_level)
-
-    if args.rounds is not None and args.rounds > 1 and args.enable_profiling:
-        logger.warning("Profiling disabled: --rounds > 1")
-        args.enable_profiling = False
-
-    # Validate paths
-    kernels_path = Path(args.kernels)
-    golden_path = Path(args.golden)
-
-    if not kernels_path.exists():
-        logger.error(f"Kernels directory not found: {kernels_path}")
-        return 1
-
-    if not golden_path.exists():
-        logger.error(f"Golden script not found: {golden_path}")
-        return 1
-
-    kernel_config_path = kernels_path / "kernel_config.py"
-    if not kernel_config_path.exists():
-        logger.error(f"kernel_config.py not found in {kernels_path}")
-        return 1
-
-    try:
-        runner = create_code_runner(
-            kernels_dir=str(args.kernels),
-            golden_path=str(args.golden),
-            device_id=args.device,
-            platform=args.platform,
-            enable_profiling=args.enable_profiling,
-            enable_dump_tensor=args.dump_tensor,
-            run_all_cases=args.all,
-            case_name=args.case,
-            pto_isa_commit=args.pto_isa_commit,
-            build_runtime=args.build,
-            repeat_rounds=args.rounds,
-            clone_protocol=args.clone_protocol,
-            skip_golden=args.skip_golden,
-        )
-
-        # Snapshot existing device logs before the run so we can identify the
-        # new log created by this run (CANN writes device logs asynchronously).
-        pre_run_device_logs = set()
-        device_log_dir = None
-        if args.enable_profiling and args.platform == "a2a3":
-            device_log_dir = _get_device_log_dir(args.device)
-            if device_log_dir.exists():
-                pre_run_device_logs = set(device_log_dir.glob("*.log"))
-
-        runner.run()
-        logger.info("=" * 60)
-        logger.info("TEST PASSED")
-        logger.info("=" * 60)
-
-        # If profiling was enabled, generate merged swimlane JSON
-        if args.enable_profiling:
-            logger.info("Generating swimlane visualization...")
-            kernel_config_path = kernels_path / "kernel_config.py"
-            swimlane_script = project_root / "tools" / "swimlane_converter.py"
-
-            if swimlane_script.exists():
-                import subprocess  # noqa: PLC0415
-
-                try:
-                    cmd = [
-                        sys.executable,
-                        str(swimlane_script),
-                        "-k",
-                        str(kernel_config_path),
-                    ]
-
-                    # Find the device log created by this run via snapshot diff
-                    if device_log_dir is not None:
-                        device_log_file = _wait_for_new_device_log(device_log_dir, pre_run_device_logs)
-                        if device_log_file:
-                            cmd += ["--device-log", str(device_log_file)]
-                        else:
-                            logger.warning("No new device log found, falling back to device-id")
-                            cmd += ["-d", str(args.device)]
-                    else:
-                        cmd += ["-d", str(args.device)]
-
-                    if logger.isEnabledFor(logging.DEBUG):
-                        cmd.append("-v")
-
-                    result = subprocess.run(cmd, check=True, capture_output=True, text=True)
-                    logger.info(result.stdout)
-                    logger.info("Swimlane JSON generation completed")
-                except subprocess.CalledProcessError as e:
-                    logger.warning(f"Failed to generate swimlane JSON: {e}")
-                    logger.debug(f"stderr: {e.stderr}")
-            else:
-                logger.warning(f"Swimlane converter script not found: {swimlane_script}")
-
-        return 0
-
-    except ImportError as e:
-        logger.error(f"Import error: {e}")
-        logger.error("Make sure you're running from the project root directory.")
-        return 1
-
-    except Exception as e:
-        logger.error(f"TEST FAILED: {e}")
-        if logger.isEnabledFor(logging.DEBUG):
-            import traceback  # noqa: PLC0415
-
-            traceback.print_exc()
-        return 1
-
-
-if __name__ == "__main__":
-    sys.exit(main())
diff --git a/tools/benchmark_rounds.sh b/tools/benchmark_rounds.sh
index 5e11f4327..e11048d1a 100755
--- a/tools/benchmark_rounds.sh
+++ b/tools/benchmark_rounds.sh
@@ -19,7 +19,6 @@ set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
-RUN_EXAMPLE="$PROJECT_ROOT/examples/scripts/run_example.py"
 
 # ---------------------------------------------------------------------------
 # Examples to benchmark and their case lists, per runtime.
@@ -388,7 +387,7 @@ run_bench() {
     trap 'rm -f -- "$pre_log_file"' RETURN
     ls -1 "$DEVICE_LOG_DIR"/*.log 2>/dev/null | sort > "$pre_log_file" || true
 
-    # Build run command: prefer test_*.py, fall back to run_example.py
+    # Build run command from the example's test_*.py; examples without one are skipped
     local test_file
     test_file=$(find "$example_dir" -maxdepth 1 -name 'test_*.py' -print -quit 2>/dev/null || true)
 
@@ -400,14 +399,8 @@ run_bench() {
             -n "$ROUNDS" --skip-golden
         )
     else
-        local kernels_dir="$example_dir/kernels"
-        local golden="$example_dir/golden.py"
-        run_cmd=(
-            python3 "$RUN_EXAMPLE"
-            -k "$kernels_dir" -g "$golden"
-            -p "$PLATFORM" -d "$DEVICE_ID"
-            -n "$ROUNDS" --skip-golden
-        )
+        echo "  SKIPPED: no test_*.py found in $example_dir"
+        return
     fi
     if [[ -n "$case_name" ]]; then
         run_cmd+=(--case "$case_name")
diff --git a/tools/verify_packaging.sh b/tools/verify_packaging.sh
index 2e897978a..c5da1f304 100755
--- a/tools/verify_packaging.sh
+++ b/tools/verify_packaging.sh
@@ -63,12 +63,6 @@ print('simpler_setup:', simpler_setup.__file__)
     echo "::group::[${mode}] standalone test_*.py --help"
     python tests/st/a2a3/aicpu_build_graph/paged_attention/test_paged_attention.py --help >/dev/null
     echo "::endgroup::"
-    echo "::group::[${mode}] ci.py --help"
-    python ci.py --help >/dev/null
-    echo "::endgroup::"
-    echo "::group::[${mode}] run_example.py --help"
-    python examples/scripts/run_example.py --help >/dev/null
-    echo "::endgroup::"
     echo "smoke[${mode}] OK"
 }